diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5524 @@ +{ + "best_metric": 0.7965201735496521, + "best_model_checkpoint": "lora-Vicuna/checkpoint-1000", + "epoch": 2.9878273699741795, + "global_step": 16200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.041, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.5971, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.2446, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 1.1737, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 1.1234, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002996287817855596, + "loss": 1.0991, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002992575635711192, + "loss": 1.089, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002988863453566788, + "loss": 1.0685, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002985151271422384, + "loss": 1.0466, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029814390892779805, + "loss": 1.0441, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 0.8649595379829407, + "eval_runtime": 1.3492, + "eval_samples_per_second": 0.741, + "eval_steps_per_second": 0.741, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002977726907133576, + "loss": 1.0417, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029740147249891725, + "loss": 1.0269, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002970302542844769, + "loss": 1.015, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002966590360700365, + "loss": 1.0089, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002962878178555961, + "loss": 1.0269, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002959165996411557, + "loss": 1.0187, + "step": 320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029554538142671533, + "loss": 1.0142, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002951741632122749, + "loss": 1.0077, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029480294499783453, + "loss": 1.004, + "step": 380 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029443172678339416, + "loss": 1.0042, + "step": 400 + }, + { + "epoch": 0.07, + "eval_loss": 0.8426274061203003, + "eval_runtime": 1.3288, + "eval_samples_per_second": 0.753, + "eval_steps_per_second": 0.753, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029406050856895373, + "loss": 1.0061, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029368929035451335, + "loss": 1.0152, + "step": 440 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293318072140073, + "loss": 1.0049, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002929468539256326, + "loss": 1.0011, + "step": 480 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029257563571119223, + "loss": 0.9932, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002922044174967518, + "loss": 0.9926, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029183319928231143, + "loss": 0.9971, + "step": 540 + }, + { + "epoch": 0.1, + "learning_rate": 0.000291461981067871, + "loss": 0.9866, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029109076285343063, + "loss": 0.9871, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029071954463899026, + "loss": 0.9906, + "step": 600 + }, + { + "epoch": 0.11, + "eval_loss": 0.8352808356285095, + "eval_runtime": 1.3169, + "eval_samples_per_second": 0.759, + "eval_steps_per_second": 0.759, + "step": 600 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002903483264245499, + "loss": 0.9797, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002899771082101095, + "loss": 0.989, + "step": 640 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002896058899956691, + "loss": 0.9931, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002892346717812287, + "loss": 0.9859, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028886345356678834, + "loss": 0.9874, + "step": 700 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002884922353523479, + "loss": 0.9888, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028812101713790754, + "loss": 0.9798, + "step": 740 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028774979892346716, + "loss": 0.981, + "step": 760 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028737858070902674, + "loss": 0.9824, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028700736249458636, + "loss": 0.9922, + "step": 800 + }, + { + "epoch": 0.15, + "eval_loss": 0.8162809610366821, + "eval_runtime": 1.2603, + "eval_samples_per_second": 0.793, + "eval_steps_per_second": 0.793, + "step": 800 + }, + { + "epoch": 0.15, + "learning_rate": 0.000286636144280146, + "loss": 0.9942, + "step": 820 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002862649260657056, + "loss": 0.9878, + "step": 840 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028589370785126524, + "loss": 0.9855, + "step": 860 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002855224896368248, + "loss": 0.9803, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028515127142238444, + "loss": 0.9793, + "step": 900 + }, + { + "epoch": 0.17, + "learning_rate": 0.000284780053207944, + "loss": 0.9939, + "step": 920 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028440883499350364, + "loss": 0.9806, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028403761677906327, + "loss": 0.9864, + "step": 960 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002836663985646229, + "loss": 0.9839, + "step": 980 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002832951803501825, + "loss": 0.9816, + "step": 1000 + }, + { + "epoch": 0.18, + "eval_loss": 0.7965201735496521, + "eval_runtime": 1.3017, + "eval_samples_per_second": 0.768, + "eval_steps_per_second": 0.768, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002829239621357421, + "loss": 0.9848, + "step": 1020 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002825527439213017, + "loss": 0.9908, + "step": 1040 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028218152570686134, + "loss": 0.9885, + "step": 1060 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002818103074924209, + "loss": 0.9772, + "step": 1080 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028143908927798054, + "loss": 0.9948, + "step": 1100 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028106787106354017, + "loss": 0.9778, + "step": 1120 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028069665284909974, + "loss": 0.9909, + "step": 1140 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028032543463465937, + "loss": 0.9802, + "step": 1160 + }, + { + "epoch": 0.22, + "learning_rate": 0.000279954216420219, + "loss": 0.9849, + "step": 1180 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002795829982057786, + "loss": 0.9788, + "step": 1200 + }, + { + "epoch": 0.22, + "eval_loss": 0.8706400990486145, + "eval_runtime": 1.3141, + "eval_samples_per_second": 0.761, + "eval_steps_per_second": 0.761, + "step": 1200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027921177999133825, + "loss": 0.9766, + "step": 1220 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002788405617768978, + "loss": 0.9853, + "step": 1240 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027846934356245745, + "loss": 0.9814, + "step": 1260 + }, + { + "epoch": 0.24, + "learning_rate": 0.000278098125348017, + "loss": 0.9747, + "step": 1280 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027772690713357665, + "loss": 0.9715, + "step": 1300 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002773556889191363, + "loss": 0.9903, + "step": 1320 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002769844707046959, + "loss": 0.9862, + "step": 1340 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002766132524902555, + "loss": 0.9791, + "step": 1360 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002762420342758151, + "loss": 0.9834, + "step": 1380 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002758708160613747, + "loss": 0.9752, + "step": 1400 + }, + { + "epoch": 0.26, + "eval_loss": 0.8170408010482788, + "eval_runtime": 1.2722, + "eval_samples_per_second": 0.786, + "eval_steps_per_second": 0.786, + "step": 1400 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027549959784693435, + "loss": 0.9803, + "step": 1420 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002751283796324939, + "loss": 0.9968, + "step": 1440 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027475716141805355, + "loss": 0.98, + "step": 1460 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002743859432036132, + "loss": 0.9819, + "step": 1480 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027401472498917275, + "loss": 0.9855, + "step": 1500 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002736435067747324, + "loss": 0.9852, + "step": 1520 + }, + { + "epoch": 0.28, + "learning_rate": 0.000273272288560292, + "loss": 0.977, + "step": 1540 + }, + { + "epoch": 0.29, + "learning_rate": 0.00027290107034585163, + "loss": 0.9918, + "step": 1560 + }, + { + "epoch": 0.29, + "learning_rate": 0.00027252985213141126, + "loss": 0.9965, + "step": 1580 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027215863391697083, + "loss": 0.9721, + "step": 1600 + }, + { + "epoch": 0.3, + "eval_loss": 0.8074629902839661, + "eval_runtime": 1.2561, + "eval_samples_per_second": 0.796, + "eval_steps_per_second": 0.796, + "step": 1600 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027178741570253045, + "loss": 0.984, + "step": 1620 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002714533193095341, + "loss": 0.9858, + "step": 1640 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002711006620058157, + "loss": 1.011, + "step": 1660 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027072944379137533, + "loss": 0.9876, + "step": 1680 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027035822557693495, + "loss": 0.9941, + "step": 1700 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002699870073624946, + "loss": 0.9935, + "step": 1720 + }, + { + "epoch": 0.32, + "learning_rate": 0.00026961578914805415, + "loss": 0.9818, + "step": 1740 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002692445709336138, + "loss": 0.9876, + "step": 1760 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002688733527191734, + "loss": 0.9628, + "step": 1780 + }, + { + "epoch": 0.33, + "learning_rate": 0.00026850213450473303, + "loss": 0.983, + "step": 1800 + }, + { + "epoch": 0.33, + "eval_loss": 0.8400484919548035, + "eval_runtime": 1.261, + "eval_samples_per_second": 0.793, + "eval_steps_per_second": 0.793, + "step": 1800 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002681309162902926, + "loss": 0.9845, + "step": 1820 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026775969807585223, + "loss": 0.98, + "step": 1840 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002673884798614118, + "loss": 0.9649, + "step": 1860 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026701726164697143, + "loss": 0.9847, + "step": 1880 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026664604343253106, + "loss": 0.9824, + "step": 1900 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002662748252180907, + "loss": 0.9777, + "step": 1920 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002659036070036503, + "loss": 0.9781, + "step": 1940 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026553238878920994, + "loss": 0.9795, + "step": 1960 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002651611705747695, + "loss": 0.9875, + "step": 1980 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026478995236032914, + "loss": 0.9741, + "step": 2000 + }, + { + "epoch": 0.37, + "eval_loss": 0.8619368672370911, + "eval_runtime": 1.2019, + "eval_samples_per_second": 0.832, + "eval_steps_per_second": 0.832, + "step": 2000 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002644187341458887, + "loss": 0.9841, + "step": 2020 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026404751593144834, + "loss": 0.9813, + "step": 2040 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026367629771700796, + "loss": 0.9777, + "step": 2060 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002633050795025676, + "loss": 0.972, + "step": 2080 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026293386128812716, + "loss": 0.97, + "step": 2100 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002625626430736868, + "loss": 0.9866, + "step": 2120 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002621914248592464, + "loss": 0.9742, + "step": 2140 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026182020664480604, + "loss": 0.9843, + "step": 2160 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002614489884303656, + "loss": 0.9616, + "step": 2180 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026107777021592524, + "loss": 0.9813, + "step": 2200 + }, + { + "epoch": 0.41, + "eval_loss": 0.8606493473052979, + "eval_runtime": 1.2145, + "eval_samples_per_second": 0.823, + "eval_steps_per_second": 0.823, + "step": 2200 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002607065520014848, + "loss": 0.9716, + "step": 2220 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026033533378704444, + "loss": 0.9822, + "step": 2240 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025996411557260407, + "loss": 0.9857, + "step": 2260 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002595928973581637, + "loss": 0.9977, + "step": 2280 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002592216791437233, + "loss": 0.9715, + "step": 2300 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025885046092928294, + "loss": 0.9623, + "step": 2320 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002584792427148425, + "loss": 0.9637, + "step": 2340 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025810802450040214, + "loss": 0.978, + "step": 2360 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002577368062859617, + "loss": 0.9802, + "step": 2380 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025736558807152134, + "loss": 0.9807, + "step": 2400 + }, + { + "epoch": 0.44, + "eval_loss": 0.8513050079345703, + "eval_runtime": 1.2201, + "eval_samples_per_second": 0.82, + "eval_steps_per_second": 0.82, + "step": 2400 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025699436985708097, + "loss": 0.9841, + "step": 2420 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002566231516426406, + "loss": 0.9714, + "step": 2440 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025625193342820017, + "loss": 0.9693, + "step": 2460 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002558807152137598, + "loss": 0.9763, + "step": 2480 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002555094969993194, + "loss": 0.9779, + "step": 2500 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025513827878487905, + "loss": 0.9736, + "step": 2520 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002547670605704386, + "loss": 0.9788, + "step": 2540 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025439584235599825, + "loss": 0.9789, + "step": 2560 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002540246241415578, + "loss": 0.9729, + "step": 2580 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025365340592711745, + "loss": 0.9752, + "step": 2600 + }, + { + "epoch": 0.48, + "eval_loss": 0.8586015105247498, + "eval_runtime": 1.2961, + "eval_samples_per_second": 0.772, + "eval_steps_per_second": 0.772, + "step": 2600 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025328218771267707, + "loss": 0.9734, + "step": 2620 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002529109694982367, + "loss": 0.9767, + "step": 2640 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002525397512837963, + "loss": 0.962, + "step": 2660 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025216853306935595, + "loss": 0.9706, + "step": 2680 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002517973148549155, + "loss": 0.975, + "step": 2700 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025142609664047515, + "loss": 0.9653, + "step": 2720 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002510548784260347, + "loss": 0.976, + "step": 2740 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025068366021159435, + "loss": 0.9768, + "step": 2760 + }, + { + "epoch": 0.51, + "learning_rate": 0.000250312441997154, + "loss": 0.9764, + "step": 2780 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002499412237827136, + "loss": 0.9741, + "step": 2800 + }, + { + "epoch": 0.52, + "eval_loss": 0.8531299829483032, + "eval_runtime": 1.2788, + "eval_samples_per_second": 0.782, + "eval_steps_per_second": 0.782, + "step": 2800 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002495700055682732, + "loss": 0.9786, + "step": 2820 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002491987873538328, + "loss": 0.9737, + "step": 2840 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024882756913939243, + "loss": 0.9648, + "step": 2860 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024845635092495206, + "loss": 0.9717, + "step": 2880 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024808513271051163, + "loss": 0.9706, + "step": 2900 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024771391449607125, + "loss": 0.9662, + "step": 2920 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002473426962816308, + "loss": 0.9719, + "step": 2940 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024697147806719045, + "loss": 0.9729, + "step": 2960 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002466002598527501, + "loss": 0.9679, + "step": 2980 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002462290416383097, + "loss": 0.9782, + "step": 3000 + }, + { + "epoch": 0.55, + "eval_loss": 0.8357999324798584, + "eval_runtime": 1.3862, + "eval_samples_per_second": 0.721, + "eval_steps_per_second": 0.721, + "step": 3000 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024585782342386933, + "loss": 0.9636, + "step": 3020 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024548660520942896, + "loss": 0.9779, + "step": 3040 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024511538699498853, + "loss": 0.9718, + "step": 3060 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024474416878054816, + "loss": 0.9756, + "step": 3080 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024437295056610773, + "loss": 0.983, + "step": 3100 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024400173235166736, + "loss": 0.9735, + "step": 3120 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024363051413722698, + "loss": 0.976, + "step": 3140 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002432592959227866, + "loss": 0.9691, + "step": 3160 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002428880777083462, + "loss": 0.9644, + "step": 3180 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002425168594939058, + "loss": 0.9768, + "step": 3200 + }, + { + "epoch": 0.59, + "eval_loss": 0.8302448987960815, + "eval_runtime": 1.362, + "eval_samples_per_second": 0.734, + "eval_steps_per_second": 0.734, + "step": 3200 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002421456412794654, + "loss": 0.9709, + "step": 3220 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024177442306502504, + "loss": 0.9696, + "step": 3240 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024140320485058466, + "loss": 0.9669, + "step": 3260 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024103198663614426, + "loss": 0.965, + "step": 3280 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002406607684217039, + "loss": 0.9757, + "step": 3300 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024028955020726346, + "loss": 0.9717, + "step": 3320 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002399183319928231, + "loss": 0.963, + "step": 3340 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023954711377838271, + "loss": 0.9519, + "step": 3360 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002391758955639423, + "loss": 0.964, + "step": 3380 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023880467734950194, + "loss": 0.967, + "step": 3400 + }, + { + "epoch": 0.63, + "eval_loss": 0.87740159034729, + "eval_runtime": 1.4495, + "eval_samples_per_second": 0.69, + "eval_steps_per_second": 0.69, + "step": 3400 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023843345913506157, + "loss": 0.9666, + "step": 3420 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023806224092062114, + "loss": 0.9616, + "step": 3440 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023769102270618077, + "loss": 0.9663, + "step": 3460 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023731980449174036, + "loss": 0.9665, + "step": 3480 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002369485862773, + "loss": 0.9662, + "step": 3500 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023657736806285962, + "loss": 0.9656, + "step": 3520 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023620614984841922, + "loss": 0.9587, + "step": 3540 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023583493163397882, + "loss": 0.9639, + "step": 3560 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023546371341953842, + "loss": 0.9727, + "step": 3580 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023509249520509804, + "loss": 0.954, + "step": 3600 + }, + { + "epoch": 0.66, + "eval_loss": 0.8274036645889282, + "eval_runtime": 1.399, + "eval_samples_per_second": 0.715, + "eval_steps_per_second": 0.715, + "step": 3600 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023472127699065767, + "loss": 0.9614, + "step": 3620 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023435005877621727, + "loss": 0.9573, + "step": 3640 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002339788405617769, + "loss": 0.9606, + "step": 3660 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023360762234733647, + "loss": 0.962, + "step": 3680 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002332364041328961, + "loss": 0.9591, + "step": 3700 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023286518591845572, + "loss": 0.9547, + "step": 3720 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023249396770401532, + "loss": 0.9627, + "step": 3740 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023212274948957495, + "loss": 0.9578, + "step": 3760 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023175153127513457, + "loss": 0.9623, + "step": 3780 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023138031306069415, + "loss": 0.946, + "step": 3800 + }, + { + "epoch": 0.7, + "eval_loss": 0.8205245137214661, + "eval_runtime": 1.3808, + "eval_samples_per_second": 0.724, + "eval_steps_per_second": 0.724, + "step": 3800 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023100909484625377, + "loss": 0.9734, + "step": 3820 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023063787663181337, + "loss": 0.964, + "step": 3840 + }, + { + "epoch": 0.71, + "learning_rate": 0.000230266658417373, + "loss": 0.9576, + "step": 3860 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022989544020293263, + "loss": 0.9512, + "step": 3880 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022952422198849222, + "loss": 0.9555, + "step": 3900 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022915300377405182, + "loss": 0.95, + "step": 3920 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022878178555961142, + "loss": 0.9617, + "step": 3940 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022841056734517105, + "loss": 0.9636, + "step": 3960 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022803934913073068, + "loss": 0.955, + "step": 3980 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022766813091629028, + "loss": 0.9516, + "step": 4000 + }, + { + "epoch": 0.74, + "eval_loss": 0.8116470575332642, + "eval_runtime": 1.3156, + "eval_samples_per_second": 0.76, + "eval_steps_per_second": 0.76, + "step": 4000 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002272969127018499, + "loss": 0.9517, + "step": 4020 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022692569448740948, + "loss": 0.9551, + "step": 4040 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002265544762729691, + "loss": 0.9578, + "step": 4060 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022618325805852873, + "loss": 0.9613, + "step": 4080 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022581203984408833, + "loss": 0.9526, + "step": 4100 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022544082162964795, + "loss": 0.966, + "step": 4120 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022506960341520755, + "loss": 0.9504, + "step": 4140 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022469838520076715, + "loss": 0.9458, + "step": 4160 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022432716698632678, + "loss": 0.9693, + "step": 4180 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022395594877188638, + "loss": 0.9566, + "step": 4200 + }, + { + "epoch": 0.77, + "eval_loss": 0.8783697485923767, + "eval_runtime": 1.3242, + "eval_samples_per_second": 0.755, + "eval_steps_per_second": 0.755, + "step": 4200 + }, + { + "epoch": 0.78, + "learning_rate": 0.000223584730557446, + "loss": 0.9736, + "step": 4220 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002232135123430056, + "loss": 0.9454, + "step": 4240 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022284229412856523, + "loss": 0.9547, + "step": 4260 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022247107591412483, + "loss": 0.9521, + "step": 4280 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022209985769968443, + "loss": 0.9546, + "step": 4300 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022172863948524406, + "loss": 0.9557, + "step": 4320 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022135742127080366, + "loss": 0.947, + "step": 4340 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022098620305636328, + "loss": 0.9576, + "step": 4360 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002206149848419229, + "loss": 0.9506, + "step": 4380 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022024376662748248, + "loss": 0.9537, + "step": 4400 + }, + { + "epoch": 0.81, + "eval_loss": 0.8563671112060547, + "eval_runtime": 1.3666, + "eval_samples_per_second": 0.732, + "eval_steps_per_second": 0.732, + "step": 4400 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002198725484130421, + "loss": 0.955, + "step": 4420 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002195013301986017, + "loss": 0.9653, + "step": 4440 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021913011198416133, + "loss": 0.9601, + "step": 4460 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021875889376972096, + "loss": 0.9589, + "step": 4480 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021838767555528056, + "loss": 0.9448, + "step": 4500 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021801645734084016, + "loss": 0.9538, + "step": 4520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021764523912639976, + "loss": 0.958, + "step": 4540 + }, + { + "epoch": 0.84, + "learning_rate": 0.0002172740209119594, + "loss": 0.9466, + "step": 4560 + }, + { + "epoch": 0.84, + "learning_rate": 0.000216902802697519, + "loss": 0.9544, + "step": 4580 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002165315844830786, + "loss": 0.9456, + "step": 4600 + }, + { + "epoch": 0.85, + "eval_loss": 0.8733828663825989, + "eval_runtime": 1.3225, + "eval_samples_per_second": 0.756, + "eval_steps_per_second": 0.756, + "step": 4600 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021616036626863824, + "loss": 0.956, + "step": 4620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002157891480541978, + "loss": 0.9614, + "step": 4640 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021541792983975744, + "loss": 0.9568, + "step": 4660 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021504671162531706, + "loss": 0.9606, + "step": 4680 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021467549341087666, + "loss": 0.9375, + "step": 4700 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002143042751964363, + "loss": 0.938, + "step": 4720 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021393305698199592, + "loss": 0.9647, + "step": 4740 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002135618387675555, + "loss": 0.96, + "step": 4760 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021319062055311512, + "loss": 0.9458, + "step": 4780 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021281940233867472, + "loss": 0.9492, + "step": 4800 + }, + { + "epoch": 0.89, + "eval_loss": 0.869820773601532, + "eval_runtime": 1.3761, + "eval_samples_per_second": 0.727, + "eval_steps_per_second": 0.727, + "step": 4800 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021244818412423434, + "loss": 0.9536, + "step": 4820 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021207696590979397, + "loss": 0.9522, + "step": 4840 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021170574769535357, + "loss": 0.9501, + "step": 4860 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021133452948091317, + "loss": 0.958, + "step": 4880 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021096331126647277, + "loss": 0.9585, + "step": 4900 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002105920930520324, + "loss": 0.9529, + "step": 4920 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021022087483759202, + "loss": 0.9463, + "step": 4940 + }, + { + "epoch": 0.91, + "learning_rate": 0.00020984965662315162, + "loss": 0.9558, + "step": 4960 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020947843840871125, + "loss": 0.9476, + "step": 4980 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020910722019427087, + "loss": 0.9423, + "step": 5000 + }, + { + "epoch": 0.92, + "eval_loss": 0.8774290084838867, + "eval_runtime": 1.3657, + "eval_samples_per_second": 0.732, + "eval_steps_per_second": 0.732, + "step": 5000 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020873600197983045, + "loss": 0.9463, + "step": 5020 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020836478376539007, + "loss": 0.9389, + "step": 5040 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020799356555094967, + "loss": 0.9443, + "step": 5060 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002076223473365093, + "loss": 0.9412, + "step": 5080 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020725112912206892, + "loss": 0.9417, + "step": 5100 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020687991090762852, + "loss": 0.9485, + "step": 5120 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020650869269318812, + "loss": 0.9479, + "step": 5140 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020613747447874772, + "loss": 0.9541, + "step": 5160 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020576625626430735, + "loss": 0.9472, + "step": 5180 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020539503804986698, + "loss": 0.9382, + "step": 5200 + }, + { + "epoch": 0.96, + "eval_loss": 0.9030184745788574, + "eval_runtime": 1.4724, + "eval_samples_per_second": 0.679, + "eval_steps_per_second": 0.679, + "step": 5200 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020502381983542658, + "loss": 0.9323, + "step": 5220 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002046526016209862, + "loss": 0.9435, + "step": 5240 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020428138340654577, + "loss": 0.9417, + "step": 5260 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002039101651921054, + "loss": 0.9479, + "step": 5280 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020353894697766503, + "loss": 0.9418, + "step": 5300 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020316772876322463, + "loss": 0.946, + "step": 5320 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020279651054878425, + "loss": 0.9475, + "step": 5340 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020242529233434388, + "loss": 0.94, + "step": 5360 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020205407411990345, + "loss": 0.943, + "step": 5380 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020168285590546308, + "loss": 0.9358, + "step": 5400 + }, + { + "epoch": 1.0, + "eval_loss": 0.8716119527816772, + "eval_runtime": 1.3296, + "eval_samples_per_second": 0.752, + "eval_steps_per_second": 0.752, + "step": 5400 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020131163769102268, + "loss": 0.9351, + "step": 5420 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002009404194765823, + "loss": 0.9412, + "step": 5440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020056920126214193, + "loss": 0.9346, + "step": 5460 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020019798304770153, + "loss": 0.9427, + "step": 5480 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019982676483326113, + "loss": 0.9553, + "step": 5500 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019945554661882073, + "loss": 0.9443, + "step": 5520 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019908432840438036, + "loss": 0.9446, + "step": 5540 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019871311018993998, + "loss": 0.9319, + "step": 5560 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019834189197549958, + "loss": 0.9333, + "step": 5580 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001979706737610592, + "loss": 0.9405, + "step": 5600 + }, + { + "epoch": 1.03, + "eval_loss": 0.8719614744186401, + "eval_runtime": 1.2911, + "eval_samples_per_second": 0.775, + "eval_steps_per_second": 0.775, + "step": 5600 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019759945554661878, + "loss": 0.9367, + "step": 5620 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001972282373321784, + "loss": 0.9352, + "step": 5640 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019685701911773803, + "loss": 0.9422, + "step": 5660 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019648580090329763, + "loss": 0.9543, + "step": 5680 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019611458268885726, + "loss": 0.9308, + "step": 5700 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001957433644744169, + "loss": 0.9471, + "step": 5720 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019537214625997646, + "loss": 0.933, + "step": 5740 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019500092804553609, + "loss": 0.9475, + "step": 5760 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019462970983109569, + "loss": 0.9368, + "step": 5780 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001942584916166553, + "loss": 0.945, + "step": 5800 + }, + { + "epoch": 1.07, + "eval_loss": 0.9158815145492554, + "eval_runtime": 1.3387, + "eval_samples_per_second": 0.747, + "eval_steps_per_second": 0.747, + "step": 5800 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019388727340221494, + "loss": 0.9407, + "step": 5820 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019351605518777454, + "loss": 0.9345, + "step": 5840 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019314483697333414, + "loss": 0.9355, + "step": 5860 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019277361875889374, + "loss": 0.9368, + "step": 5880 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019240240054445336, + "loss": 0.9339, + "step": 5900 + }, + { + "epoch": 1.09, + "learning_rate": 0.000192031182330013, + "loss": 0.9315, + "step": 5920 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001916599641155726, + "loss": 0.9382, + "step": 5940 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019128874590113222, + "loss": 0.9391, + "step": 5960 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001909175276866918, + "loss": 0.9324, + "step": 5980 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019054630947225142, + "loss": 0.9346, + "step": 6000 + }, + { + "epoch": 1.11, + "eval_loss": 0.8685809969902039, + "eval_runtime": 1.2797, + "eval_samples_per_second": 0.781, + "eval_steps_per_second": 0.781, + "step": 6000 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019017509125781104, + "loss": 0.9302, + "step": 6020 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018980387304337064, + "loss": 0.9375, + "step": 6040 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018943265482893027, + "loss": 0.9265, + "step": 6060 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018906143661448987, + "loss": 0.9421, + "step": 6080 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018869021840004947, + "loss": 0.9382, + "step": 6100 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001883190001856091, + "loss": 0.9376, + "step": 6120 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001879477819711687, + "loss": 0.9315, + "step": 6140 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018757656375672832, + "loss": 0.9326, + "step": 6160 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018720534554228792, + "loss": 0.9283, + "step": 6180 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018683412732784755, + "loss": 0.9252, + "step": 6200 + }, + { + "epoch": 1.14, + "eval_loss": 0.8812684416770935, + "eval_runtime": 1.2442, + "eval_samples_per_second": 0.804, + "eval_steps_per_second": 0.804, + "step": 6200 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018646290911340715, + "loss": 0.929, + "step": 6220 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018609169089896674, + "loss": 0.9303, + "step": 6240 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018572047268452637, + "loss": 0.9288, + "step": 6260 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018534925447008597, + "loss": 0.9308, + "step": 6280 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001849780362556456, + "loss": 0.9247, + "step": 6300 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018460681804120522, + "loss": 0.939, + "step": 6320 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001842355998267648, + "loss": 0.927, + "step": 6340 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018386438161232442, + "loss": 0.9227, + "step": 6360 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018349316339788402, + "loss": 0.9275, + "step": 6380 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018312194518344365, + "loss": 0.9377, + "step": 6400 + }, + { + "epoch": 1.18, + "eval_loss": 0.9376851916313171, + "eval_runtime": 1.3094, + "eval_samples_per_second": 0.764, + "eval_steps_per_second": 0.764, + "step": 6400 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018275072696900328, + "loss": 0.9386, + "step": 6420 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018237950875456287, + "loss": 0.9389, + "step": 6440 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018200829054012247, + "loss": 0.9323, + "step": 6460 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018163707232568207, + "loss": 0.9304, + "step": 6480 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001812658541112417, + "loss": 0.9382, + "step": 6500 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018089463589680133, + "loss": 0.9344, + "step": 6520 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018052341768236093, + "loss": 0.9382, + "step": 6540 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018015219946792055, + "loss": 0.9298, + "step": 6560 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017978098125348013, + "loss": 0.9246, + "step": 6580 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017940976303903975, + "loss": 0.9349, + "step": 6600 + }, + { + "epoch": 1.22, + "eval_loss": 0.8311364650726318, + "eval_runtime": 1.2953, + "eval_samples_per_second": 0.772, + "eval_steps_per_second": 0.772, + "step": 6600 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017903854482459938, + "loss": 0.9309, + "step": 6620 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017866732661015898, + "loss": 0.9311, + "step": 6640 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001782961083957186, + "loss": 0.938, + "step": 6660 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017792489018127823, + "loss": 0.9324, + "step": 6680 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017755367196683783, + "loss": 0.9128, + "step": 6700 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017718245375239743, + "loss": 0.9407, + "step": 6720 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017681123553795703, + "loss": 0.9286, + "step": 6740 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017644001732351666, + "loss": 0.9298, + "step": 6760 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017606879910907628, + "loss": 0.9329, + "step": 6780 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017569758089463588, + "loss": 0.9331, + "step": 6800 + }, + { + "epoch": 1.25, + "eval_loss": 0.8751674890518188, + "eval_runtime": 1.3065, + "eval_samples_per_second": 0.765, + "eval_steps_per_second": 0.765, + "step": 6800 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001753263626801955, + "loss": 0.9264, + "step": 6820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017495514446575508, + "loss": 0.9345, + "step": 6840 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001745839262513147, + "loss": 0.925, + "step": 6860 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017421270803687433, + "loss": 0.9227, + "step": 6880 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017384148982243393, + "loss": 0.9237, + "step": 6900 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017347027160799356, + "loss": 0.9365, + "step": 6920 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001730990533935532, + "loss": 0.9356, + "step": 6940 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017272783517911276, + "loss": 0.9242, + "step": 6960 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017235661696467239, + "loss": 0.9297, + "step": 6980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017198539875023199, + "loss": 0.9232, + "step": 7000 + }, + { + "epoch": 1.29, + "eval_loss": 0.8359224796295166, + "eval_runtime": 1.2767, + "eval_samples_per_second": 0.783, + "eval_steps_per_second": 0.783, + "step": 7000 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001716141805357916, + "loss": 0.9324, + "step": 7020 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017124296232135124, + "loss": 0.9222, + "step": 7040 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017087174410691084, + "loss": 0.9214, + "step": 7060 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017050052589247044, + "loss": 0.9214, + "step": 7080 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017012930767803004, + "loss": 0.931, + "step": 7100 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016975808946358966, + "loss": 0.9218, + "step": 7120 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001693868712491493, + "loss": 0.9136, + "step": 7140 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001690156530347089, + "loss": 0.9284, + "step": 7160 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016864443482026852, + "loss": 0.921, + "step": 7180 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001682732166058281, + "loss": 0.9245, + "step": 7200 + }, + { + "epoch": 1.33, + "eval_loss": 0.8561541438102722, + "eval_runtime": 1.3583, + "eval_samples_per_second": 0.736, + "eval_steps_per_second": 0.736, + "step": 7200 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016790199839138771, + "loss": 0.9259, + "step": 7220 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016753078017694734, + "loss": 0.9238, + "step": 7240 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016715956196250694, + "loss": 0.9237, + "step": 7260 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016678834374806657, + "loss": 0.9111, + "step": 7280 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001664171255336262, + "loss": 0.9257, + "step": 7300 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016604590731918577, + "loss": 0.9128, + "step": 7320 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001656746891047454, + "loss": 0.9219, + "step": 7340 + }, + { + "epoch": 1.36, + "learning_rate": 0.000165303470890305, + "loss": 0.9163, + "step": 7360 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016493225267586462, + "loss": 0.9195, + "step": 7380 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016456103446142425, + "loss": 0.9283, + "step": 7400 + }, + { + "epoch": 1.36, + "eval_loss": 0.8775261044502258, + "eval_runtime": 1.3808, + "eval_samples_per_second": 0.724, + "eval_steps_per_second": 0.724, + "step": 7400 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016418981624698385, + "loss": 0.9177, + "step": 7420 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016381859803254344, + "loss": 0.9113, + "step": 7440 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016344737981810304, + "loss": 0.9189, + "step": 7460 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016307616160366267, + "loss": 0.9096, + "step": 7480 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001627049433892223, + "loss": 0.9191, + "step": 7500 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001623337251747819, + "loss": 0.9107, + "step": 7520 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016196250696034152, + "loss": 0.9173, + "step": 7540 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001615912887459011, + "loss": 0.9187, + "step": 7560 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016122007053146072, + "loss": 0.9187, + "step": 7580 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016084885231702035, + "loss": 0.9211, + "step": 7600 + }, + { + "epoch": 1.4, + "eval_loss": 0.8760462999343872, + "eval_runtime": 1.4851, + "eval_samples_per_second": 0.673, + "eval_steps_per_second": 0.673, + "step": 7600 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016047763410257995, + "loss": 0.9116, + "step": 7620 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016010641588813957, + "loss": 0.9117, + "step": 7640 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001597351976736992, + "loss": 0.9184, + "step": 7660 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015936397945925877, + "loss": 0.9122, + "step": 7680 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001589927612448184, + "loss": 0.9155, + "step": 7700 + }, + { + "epoch": 1.42, + "learning_rate": 0.000158621543030378, + "loss": 0.9186, + "step": 7720 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015825032481593763, + "loss": 0.9144, + "step": 7740 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015787910660149725, + "loss": 0.9224, + "step": 7760 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015750788838705685, + "loss": 0.9192, + "step": 7780 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015713667017261645, + "loss": 0.9106, + "step": 7800 + }, + { + "epoch": 1.44, + "eval_loss": 0.8746755719184875, + "eval_runtime": 1.3658, + "eval_samples_per_second": 0.732, + "eval_steps_per_second": 0.732, + "step": 7800 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015676545195817605, + "loss": 0.9106, + "step": 7820 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015639423374373568, + "loss": 0.9024, + "step": 7840 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001560230155292953, + "loss": 0.9055, + "step": 7860 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001556517973148549, + "loss": 0.9106, + "step": 7880 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015528057910041453, + "loss": 0.9219, + "step": 7900 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001549093608859741, + "loss": 0.9015, + "step": 7920 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015453814267153373, + "loss": 0.9029, + "step": 7940 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015416692445709336, + "loss": 0.9095, + "step": 7960 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015379570624265296, + "loss": 0.9284, + "step": 7980 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015342448802821258, + "loss": 0.9047, + "step": 8000 + }, + { + "epoch": 1.48, + "eval_loss": 0.8788416981697083, + "eval_runtime": 1.3646, + "eval_samples_per_second": 0.733, + "eval_steps_per_second": 0.733, + "step": 8000 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015305326981377218, + "loss": 0.9151, + "step": 8020 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015268205159933178, + "loss": 0.9076, + "step": 8040 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001523108333848914, + "loss": 0.9134, + "step": 8060 + }, + { + "epoch": 1.49, + "learning_rate": 0.000151939615170451, + "loss": 0.9136, + "step": 8080 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015156839695601063, + "loss": 0.9073, + "step": 8100 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015119717874157023, + "loss": 0.9054, + "step": 8120 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015082596052712986, + "loss": 0.9169, + "step": 8140 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015045474231268946, + "loss": 0.9249, + "step": 8160 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015008352409824906, + "loss": 0.9032, + "step": 8180 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014971230588380869, + "loss": 0.9089, + "step": 8200 + }, + { + "epoch": 1.51, + "eval_loss": 0.8750318288803101, + "eval_runtime": 1.361, + "eval_samples_per_second": 0.735, + "eval_steps_per_second": 0.735, + "step": 8200 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014934108766936828, + "loss": 0.903, + "step": 8220 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001489698694549279, + "loss": 0.9045, + "step": 8240 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001485986512404875, + "loss": 0.9146, + "step": 8260 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014822743302604714, + "loss": 0.9085, + "step": 8280 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014785621481160674, + "loss": 0.9105, + "step": 8300 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014748499659716634, + "loss": 0.9034, + "step": 8320 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014711377838272596, + "loss": 0.906, + "step": 8340 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001467425601682856, + "loss": 0.9055, + "step": 8360 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001463713419538452, + "loss": 0.9013, + "step": 8380 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001460001237394048, + "loss": 0.9136, + "step": 8400 + }, + { + "epoch": 1.55, + "eval_loss": 0.8716868162155151, + "eval_runtime": 1.3893, + "eval_samples_per_second": 0.72, + "eval_steps_per_second": 0.72, + "step": 8400 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014562890552496441, + "loss": 0.8991, + "step": 8420 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014525768731052401, + "loss": 0.9072, + "step": 8440 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014488646909608364, + "loss": 0.9185, + "step": 8460 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014451525088164324, + "loss": 0.9097, + "step": 8480 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014414403266720284, + "loss": 0.9044, + "step": 8500 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014377281445276247, + "loss": 0.9095, + "step": 8520 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001434015962383221, + "loss": 0.9119, + "step": 8540 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001430303780238817, + "loss": 0.8971, + "step": 8560 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001426591598094413, + "loss": 0.9011, + "step": 8580 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014228794159500092, + "loss": 0.9048, + "step": 8600 + }, + { + "epoch": 1.59, + "eval_loss": 0.8536492586135864, + "eval_runtime": 1.2569, + "eval_samples_per_second": 0.796, + "eval_steps_per_second": 0.796, + "step": 8600 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014191672338056052, + "loss": 0.9031, + "step": 8620 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014154550516612014, + "loss": 0.9078, + "step": 8640 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014117428695167974, + "loss": 0.9115, + "step": 8660 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014080306873723934, + "loss": 0.9018, + "step": 8680 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014043185052279897, + "loss": 0.8974, + "step": 8700 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001400606323083586, + "loss": 0.9084, + "step": 8720 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001396894140939182, + "loss": 0.9041, + "step": 8740 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001393181958794778, + "loss": 0.8967, + "step": 8760 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013894697766503742, + "loss": 0.8969, + "step": 8780 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013857575945059702, + "loss": 0.8993, + "step": 8800 + }, + { + "epoch": 1.62, + "eval_loss": 0.8307236433029175, + "eval_runtime": 1.2771, + "eval_samples_per_second": 0.783, + "eval_steps_per_second": 0.783, + "step": 8800 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013820454123615665, + "loss": 0.9031, + "step": 8820 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013783332302171625, + "loss": 0.9008, + "step": 8840 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013746210480727585, + "loss": 0.9039, + "step": 8860 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013709088659283547, + "loss": 0.9063, + "step": 8880 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001367196683783951, + "loss": 0.901, + "step": 8900 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001363484501639547, + "loss": 0.8973, + "step": 8920 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001359772319495143, + "loss": 0.9057, + "step": 8940 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013560601373507393, + "loss": 0.898, + "step": 8960 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013523479552063353, + "loss": 0.9027, + "step": 8980 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013486357730619315, + "loss": 0.9084, + "step": 9000 + }, + { + "epoch": 1.66, + "eval_loss": 0.8812193274497986, + "eval_runtime": 1.2199, + "eval_samples_per_second": 0.82, + "eval_steps_per_second": 0.82, + "step": 9000 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013449235909175275, + "loss": 0.9043, + "step": 9020 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013412114087731235, + "loss": 0.9021, + "step": 9040 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013374992266287198, + "loss": 0.8951, + "step": 9060 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001333787044484316, + "loss": 0.8865, + "step": 9080 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001330074862339912, + "loss": 0.9027, + "step": 9100 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001326362680195508, + "loss": 0.8976, + "step": 9120 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013226504980511043, + "loss": 0.8911, + "step": 9140 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013189383159067003, + "loss": 0.8973, + "step": 9160 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013152261337622966, + "loss": 0.9076, + "step": 9180 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013115139516178925, + "loss": 0.899, + "step": 9200 + }, + { + "epoch": 1.7, + "eval_loss": 0.82787024974823, + "eval_runtime": 1.2643, + "eval_samples_per_second": 0.791, + "eval_steps_per_second": 0.791, + "step": 9200 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013078017694734885, + "loss": 0.9022, + "step": 9220 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013040895873290848, + "loss": 0.8969, + "step": 9240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001300377405184681, + "loss": 0.895, + "step": 9260 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001296665223040277, + "loss": 0.9034, + "step": 9280 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001292953040895873, + "loss": 0.8892, + "step": 9300 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012892408587514693, + "loss": 0.8956, + "step": 9320 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012855286766070656, + "loss": 0.8952, + "step": 9340 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012818164944626616, + "loss": 0.89, + "step": 9360 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012781043123182576, + "loss": 0.9029, + "step": 9380 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012743921301738539, + "loss": 0.8954, + "step": 9400 + }, + { + "epoch": 1.73, + "eval_loss": 0.8729041218757629, + "eval_runtime": 1.2221, + "eval_samples_per_second": 0.818, + "eval_steps_per_second": 0.818, + "step": 9400 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012706799480294498, + "loss": 0.8805, + "step": 9420 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001266967765885046, + "loss": 0.8936, + "step": 9440 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001263255583740642, + "loss": 0.8893, + "step": 9460 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001259543401596238, + "loss": 0.8953, + "step": 9480 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012558312194518344, + "loss": 0.8943, + "step": 9500 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012521190373074306, + "loss": 0.8766, + "step": 9520 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012484068551630266, + "loss": 0.899, + "step": 9540 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012446946730186226, + "loss": 0.8949, + "step": 9560 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001240982490874219, + "loss": 0.8849, + "step": 9580 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001237270308729815, + "loss": 0.894, + "step": 9600 + }, + { + "epoch": 1.77, + "eval_loss": 0.8376001715660095, + "eval_runtime": 1.3277, + "eval_samples_per_second": 0.753, + "eval_steps_per_second": 0.753, + "step": 9600 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012335581265854111, + "loss": 0.9029, + "step": 9620 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012298459444410071, + "loss": 0.898, + "step": 9640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012261337622966031, + "loss": 0.8848, + "step": 9660 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012224215801521994, + "loss": 0.8932, + "step": 9680 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012187093980077955, + "loss": 0.891, + "step": 9700 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012149972158633915, + "loss": 0.8936, + "step": 9720 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012112850337189877, + "loss": 0.8907, + "step": 9740 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012075728515745839, + "loss": 0.8958, + "step": 9760 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012038606694301799, + "loss": 0.8912, + "step": 9780 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001200148487285776, + "loss": 0.9032, + "step": 9800 + }, + { + "epoch": 1.81, + "eval_loss": 0.8445655703544617, + "eval_runtime": 1.3427, + "eval_samples_per_second": 0.745, + "eval_steps_per_second": 0.745, + "step": 9800 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011964363051413722, + "loss": 0.8918, + "step": 9820 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011927241229969682, + "loss": 0.8865, + "step": 9840 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011890119408525644, + "loss": 0.8893, + "step": 9860 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011852997587081606, + "loss": 0.8863, + "step": 9880 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011815875765637566, + "loss": 0.8978, + "step": 9900 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011778753944193527, + "loss": 0.886, + "step": 9920 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001174163212274949, + "loss": 0.8789, + "step": 9940 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001170451030130545, + "loss": 0.8944, + "step": 9960 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011667388479861411, + "loss": 0.8964, + "step": 9980 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011630266658417372, + "loss": 0.8921, + "step": 10000 + }, + { + "epoch": 1.84, + "eval_loss": 0.8549759387969971, + "eval_runtime": 1.2407, + "eval_samples_per_second": 0.806, + "eval_steps_per_second": 0.806, + "step": 10000 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011593144836973332, + "loss": 0.8918, + "step": 10020 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011556023015529295, + "loss": 0.8966, + "step": 10040 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011518901194085256, + "loss": 0.8919, + "step": 10060 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011481779372641216, + "loss": 0.8963, + "step": 10080 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011444657551197177, + "loss": 0.8805, + "step": 10100 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001140753572975314, + "loss": 0.8837, + "step": 10120 + }, + { + "epoch": 1.87, + "learning_rate": 0.000113704139083091, + "loss": 0.8919, + "step": 10140 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011333292086865061, + "loss": 0.8904, + "step": 10160 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011296170265421023, + "loss": 0.8765, + "step": 10180 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011259048443976982, + "loss": 0.8846, + "step": 10200 + }, + { + "epoch": 1.88, + "eval_loss": 0.8317739963531494, + "eval_runtime": 1.3125, + "eval_samples_per_second": 0.762, + "eval_steps_per_second": 0.762, + "step": 10200 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011221926622532945, + "loss": 0.8754, + "step": 10220 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011184804801088906, + "loss": 0.8824, + "step": 10240 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011147682979644866, + "loss": 0.888, + "step": 10260 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011110561158200828, + "loss": 0.8986, + "step": 10280 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001107343933675679, + "loss": 0.8764, + "step": 10300 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001103631751531275, + "loss": 0.8901, + "step": 10320 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010999195693868712, + "loss": 0.8859, + "step": 10340 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010962073872424673, + "loss": 0.8868, + "step": 10360 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010924952050980633, + "loss": 0.8827, + "step": 10380 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010887830229536595, + "loss": 0.8911, + "step": 10400 + }, + { + "epoch": 1.92, + "eval_loss": 0.8604958057403564, + "eval_runtime": 1.2674, + "eval_samples_per_second": 0.789, + "eval_steps_per_second": 0.789, + "step": 10400 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010850708408092557, + "loss": 0.8888, + "step": 10420 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010813586586648517, + "loss": 0.8878, + "step": 10440 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010776464765204478, + "loss": 0.8853, + "step": 10460 + }, + { + "epoch": 1.93, + "learning_rate": 0.0001073934294376044, + "loss": 0.8831, + "step": 10480 + }, + { + "epoch": 1.94, + "learning_rate": 0.000107022211223164, + "loss": 0.8906, + "step": 10500 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010665099300872362, + "loss": 0.8869, + "step": 10520 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010627977479428323, + "loss": 0.882, + "step": 10540 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010590855657984283, + "loss": 0.8789, + "step": 10560 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010553733836540246, + "loss": 0.8819, + "step": 10580 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010516612015096207, + "loss": 0.8765, + "step": 10600 + }, + { + "epoch": 1.95, + "eval_loss": 0.8246310949325562, + "eval_runtime": 1.3918, + "eval_samples_per_second": 0.719, + "eval_steps_per_second": 0.719, + "step": 10600 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010479490193652167, + "loss": 0.8754, + "step": 10620 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010442368372208128, + "loss": 0.8876, + "step": 10640 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001040524655076409, + "loss": 0.8871, + "step": 10660 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010368124729320051, + "loss": 0.879, + "step": 10680 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010331002907876012, + "loss": 0.8809, + "step": 10700 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010293881086431974, + "loss": 0.8847, + "step": 10720 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010256759264987934, + "loss": 0.8861, + "step": 10740 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010219637443543895, + "loss": 0.8758, + "step": 10760 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010182515622099858, + "loss": 0.8781, + "step": 10780 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010145393800655817, + "loss": 0.8726, + "step": 10800 + }, + { + "epoch": 1.99, + "eval_loss": 0.846712052822113, + "eval_runtime": 1.3944, + "eval_samples_per_second": 0.717, + "eval_steps_per_second": 0.717, + "step": 10800 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010108271979211779, + "loss": 0.8774, + "step": 10820 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001007115015776774, + "loss": 0.8755, + "step": 10840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000100340283363237, + "loss": 0.8814, + "step": 10860 + }, + { + "epoch": 2.01, + "learning_rate": 9.996906514879663e-05, + "loss": 0.8694, + "step": 10880 + }, + { + "epoch": 2.01, + "learning_rate": 9.959784693435624e-05, + "loss": 0.8808, + "step": 10900 + }, + { + "epoch": 2.01, + "learning_rate": 9.922662871991584e-05, + "loss": 0.8824, + "step": 10920 + }, + { + "epoch": 2.02, + "learning_rate": 9.885541050547545e-05, + "loss": 0.8669, + "step": 10940 + }, + { + "epoch": 2.02, + "learning_rate": 9.848419229103508e-05, + "loss": 0.8702, + "step": 10960 + }, + { + "epoch": 2.03, + "learning_rate": 9.811297407659468e-05, + "loss": 0.8722, + "step": 10980 + }, + { + "epoch": 2.03, + "learning_rate": 9.774175586215429e-05, + "loss": 0.8844, + "step": 11000 + }, + { + "epoch": 2.03, + "eval_loss": 0.8575500845909119, + "eval_runtime": 1.622, + "eval_samples_per_second": 0.617, + "eval_steps_per_second": 0.617, + "step": 11000 + }, + { + "epoch": 2.03, + "learning_rate": 9.73705376477139e-05, + "loss": 0.877, + "step": 11020 + }, + { + "epoch": 2.04, + "learning_rate": 9.699931943327353e-05, + "loss": 0.8635, + "step": 11040 + }, + { + "epoch": 2.04, + "learning_rate": 9.662810121883313e-05, + "loss": 0.8729, + "step": 11060 + }, + { + "epoch": 2.04, + "learning_rate": 9.625688300439274e-05, + "loss": 0.865, + "step": 11080 + }, + { + "epoch": 2.05, + "learning_rate": 9.588566478995236e-05, + "loss": 0.8811, + "step": 11100 + }, + { + "epoch": 2.05, + "learning_rate": 9.551444657551196e-05, + "loss": 0.8729, + "step": 11120 + }, + { + "epoch": 2.05, + "learning_rate": 9.514322836107158e-05, + "loss": 0.8865, + "step": 11140 + }, + { + "epoch": 2.06, + "learning_rate": 9.47720101466312e-05, + "loss": 0.8747, + "step": 11160 + }, + { + "epoch": 2.06, + "learning_rate": 9.44007919321908e-05, + "loss": 0.8779, + "step": 11180 + }, + { + "epoch": 2.07, + "learning_rate": 9.402957371775041e-05, + "loss": 0.8764, + "step": 11200 + }, + { + "epoch": 2.07, + "eval_loss": 0.8142430186271667, + "eval_runtime": 1.4953, + "eval_samples_per_second": 0.669, + "eval_steps_per_second": 0.669, + "step": 11200 + }, + { + "epoch": 2.07, + "learning_rate": 9.365835550331003e-05, + "loss": 0.8779, + "step": 11220 + }, + { + "epoch": 2.07, + "learning_rate": 9.328713728886963e-05, + "loss": 0.8702, + "step": 11240 + }, + { + "epoch": 2.08, + "learning_rate": 9.291591907442925e-05, + "loss": 0.8832, + "step": 11260 + }, + { + "epoch": 2.08, + "learning_rate": 9.254470085998886e-05, + "loss": 0.8718, + "step": 11280 + }, + { + "epoch": 2.08, + "learning_rate": 9.217348264554846e-05, + "loss": 0.8726, + "step": 11300 + }, + { + "epoch": 2.09, + "learning_rate": 9.180226443110809e-05, + "loss": 0.8677, + "step": 11320 + }, + { + "epoch": 2.09, + "learning_rate": 9.14310462166677e-05, + "loss": 0.8748, + "step": 11340 + }, + { + "epoch": 2.1, + "learning_rate": 9.10598280022273e-05, + "loss": 0.8694, + "step": 11360 + }, + { + "epoch": 2.1, + "learning_rate": 9.068860978778691e-05, + "loss": 0.8613, + "step": 11380 + }, + { + "epoch": 2.1, + "learning_rate": 9.031739157334652e-05, + "loss": 0.8687, + "step": 11400 + }, + { + "epoch": 2.1, + "eval_loss": 0.827206552028656, + "eval_runtime": 1.6259, + "eval_samples_per_second": 0.615, + "eval_steps_per_second": 0.615, + "step": 11400 + }, + { + "epoch": 2.11, + "learning_rate": 8.994617335890614e-05, + "loss": 0.8735, + "step": 11420 + }, + { + "epoch": 2.11, + "learning_rate": 8.957495514446575e-05, + "loss": 0.8691, + "step": 11440 + }, + { + "epoch": 2.11, + "learning_rate": 8.920373693002536e-05, + "loss": 0.8605, + "step": 11460 + }, + { + "epoch": 2.12, + "learning_rate": 8.883251871558496e-05, + "loss": 0.8715, + "step": 11480 + }, + { + "epoch": 2.12, + "learning_rate": 8.846130050114459e-05, + "loss": 0.8541, + "step": 11500 + }, + { + "epoch": 2.12, + "learning_rate": 8.80900822867042e-05, + "loss": 0.8668, + "step": 11520 + }, + { + "epoch": 2.13, + "learning_rate": 8.77188640722638e-05, + "loss": 0.8745, + "step": 11540 + }, + { + "epoch": 2.13, + "learning_rate": 8.734764585782342e-05, + "loss": 0.8781, + "step": 11560 + }, + { + "epoch": 2.14, + "learning_rate": 8.697642764338303e-05, + "loss": 0.8782, + "step": 11580 + }, + { + "epoch": 2.14, + "learning_rate": 8.660520942894264e-05, + "loss": 0.8733, + "step": 11600 + }, + { + "epoch": 2.14, + "eval_loss": 0.8250707983970642, + "eval_runtime": 1.5297, + "eval_samples_per_second": 0.654, + "eval_steps_per_second": 0.654, + "step": 11600 + }, + { + "epoch": 2.14, + "learning_rate": 8.623399121450225e-05, + "loss": 0.8771, + "step": 11620 + }, + { + "epoch": 2.15, + "learning_rate": 8.586277300006187e-05, + "loss": 0.8809, + "step": 11640 + }, + { + "epoch": 2.15, + "learning_rate": 8.549155478562147e-05, + "loss": 0.8663, + "step": 11660 + }, + { + "epoch": 2.15, + "learning_rate": 8.512033657118108e-05, + "loss": 0.8565, + "step": 11680 + }, + { + "epoch": 2.16, + "learning_rate": 8.47491183567407e-05, + "loss": 0.8688, + "step": 11700 + }, + { + "epoch": 2.16, + "learning_rate": 8.43779001423003e-05, + "loss": 0.8712, + "step": 11720 + }, + { + "epoch": 2.17, + "learning_rate": 8.400668192785992e-05, + "loss": 0.8692, + "step": 11740 + }, + { + "epoch": 2.17, + "learning_rate": 8.363546371341953e-05, + "loss": 0.8639, + "step": 11760 + }, + { + "epoch": 2.17, + "learning_rate": 8.326424549897913e-05, + "loss": 0.8598, + "step": 11780 + }, + { + "epoch": 2.18, + "learning_rate": 8.289302728453876e-05, + "loss": 0.8675, + "step": 11800 + }, + { + "epoch": 2.18, + "eval_loss": 0.8465573191642761, + "eval_runtime": 1.6002, + "eval_samples_per_second": 0.625, + "eval_steps_per_second": 0.625, + "step": 11800 + }, + { + "epoch": 2.18, + "learning_rate": 8.252180907009837e-05, + "loss": 0.8667, + "step": 11820 + }, + { + "epoch": 2.18, + "learning_rate": 8.215059085565797e-05, + "loss": 0.8696, + "step": 11840 + }, + { + "epoch": 2.19, + "learning_rate": 8.177937264121758e-05, + "loss": 0.8691, + "step": 11860 + }, + { + "epoch": 2.19, + "learning_rate": 8.140815442677721e-05, + "loss": 0.8649, + "step": 11880 + }, + { + "epoch": 2.19, + "learning_rate": 8.103693621233681e-05, + "loss": 0.8605, + "step": 11900 + }, + { + "epoch": 2.2, + "learning_rate": 8.066571799789642e-05, + "loss": 0.8698, + "step": 11920 + }, + { + "epoch": 2.2, + "learning_rate": 8.029449978345604e-05, + "loss": 0.8595, + "step": 11940 + }, + { + "epoch": 2.21, + "learning_rate": 7.992328156901564e-05, + "loss": 0.867, + "step": 11960 + }, + { + "epoch": 2.21, + "learning_rate": 7.955206335457526e-05, + "loss": 0.8662, + "step": 11980 + }, + { + "epoch": 2.21, + "learning_rate": 7.918084514013487e-05, + "loss": 0.8747, + "step": 12000 + }, + { + "epoch": 2.21, + "eval_loss": 0.8371507525444031, + "eval_runtime": 1.5617, + "eval_samples_per_second": 0.64, + "eval_steps_per_second": 0.64, + "step": 12000 + }, + { + "epoch": 2.22, + "learning_rate": 7.880962692569447e-05, + "loss": 0.8635, + "step": 12020 + }, + { + "epoch": 2.22, + "learning_rate": 7.843840871125409e-05, + "loss": 0.8648, + "step": 12040 + }, + { + "epoch": 2.22, + "learning_rate": 7.806719049681371e-05, + "loss": 0.8653, + "step": 12060 + }, + { + "epoch": 2.23, + "learning_rate": 7.769597228237331e-05, + "loss": 0.8619, + "step": 12080 + }, + { + "epoch": 2.23, + "learning_rate": 7.732475406793293e-05, + "loss": 0.8616, + "step": 12100 + }, + { + "epoch": 2.24, + "learning_rate": 7.695353585349254e-05, + "loss": 0.8593, + "step": 12120 + }, + { + "epoch": 2.24, + "learning_rate": 7.658231763905214e-05, + "loss": 0.8683, + "step": 12140 + }, + { + "epoch": 2.24, + "learning_rate": 7.621109942461177e-05, + "loss": 0.8641, + "step": 12160 + }, + { + "epoch": 2.25, + "learning_rate": 7.583988121017138e-05, + "loss": 0.8596, + "step": 12180 + }, + { + "epoch": 2.25, + "learning_rate": 7.546866299573098e-05, + "loss": 0.8626, + "step": 12200 + }, + { + "epoch": 2.25, + "eval_loss": 0.8443911075592041, + "eval_runtime": 1.578, + "eval_samples_per_second": 0.634, + "eval_steps_per_second": 0.634, + "step": 12200 + }, + { + "epoch": 2.25, + "learning_rate": 7.509744478129059e-05, + "loss": 0.8568, + "step": 12220 + }, + { + "epoch": 2.26, + "learning_rate": 7.47262265668502e-05, + "loss": 0.8688, + "step": 12240 + }, + { + "epoch": 2.26, + "learning_rate": 7.435500835240982e-05, + "loss": 0.8667, + "step": 12260 + }, + { + "epoch": 2.26, + "learning_rate": 7.398379013796943e-05, + "loss": 0.8638, + "step": 12280 + }, + { + "epoch": 2.27, + "learning_rate": 7.361257192352904e-05, + "loss": 0.8633, + "step": 12300 + }, + { + "epoch": 2.27, + "learning_rate": 7.324135370908866e-05, + "loss": 0.8716, + "step": 12320 + }, + { + "epoch": 2.28, + "learning_rate": 7.287013549464827e-05, + "loss": 0.862, + "step": 12340 + }, + { + "epoch": 2.28, + "learning_rate": 7.249891728020788e-05, + "loss": 0.8673, + "step": 12360 + }, + { + "epoch": 2.28, + "learning_rate": 7.21276990657675e-05, + "loss": 0.8518, + "step": 12380 + }, + { + "epoch": 2.29, + "learning_rate": 7.17564808513271e-05, + "loss": 0.8573, + "step": 12400 + }, + { + "epoch": 2.29, + "eval_loss": 0.8218865394592285, + "eval_runtime": 1.5777, + "eval_samples_per_second": 0.634, + "eval_steps_per_second": 0.634, + "step": 12400 + }, + { + "epoch": 2.29, + "learning_rate": 7.138526263688672e-05, + "loss": 0.8647, + "step": 12420 + }, + { + "epoch": 2.29, + "learning_rate": 7.101404442244632e-05, + "loss": 0.8584, + "step": 12440 + }, + { + "epoch": 2.3, + "learning_rate": 7.064282620800593e-05, + "loss": 0.8642, + "step": 12460 + }, + { + "epoch": 2.3, + "learning_rate": 7.027160799356555e-05, + "loss": 0.8612, + "step": 12480 + }, + { + "epoch": 2.31, + "learning_rate": 6.990038977912516e-05, + "loss": 0.8594, + "step": 12500 + }, + { + "epoch": 2.31, + "learning_rate": 6.952917156468477e-05, + "loss": 0.856, + "step": 12520 + }, + { + "epoch": 2.31, + "learning_rate": 6.915795335024439e-05, + "loss": 0.8527, + "step": 12540 + }, + { + "epoch": 2.32, + "learning_rate": 6.878673513580399e-05, + "loss": 0.8721, + "step": 12560 + }, + { + "epoch": 2.32, + "learning_rate": 6.84155169213636e-05, + "loss": 0.8555, + "step": 12580 + }, + { + "epoch": 2.32, + "learning_rate": 6.804429870692321e-05, + "loss": 0.8666, + "step": 12600 + }, + { + "epoch": 2.32, + "eval_loss": 0.8119338750839233, + "eval_runtime": 1.4548, + "eval_samples_per_second": 0.687, + "eval_steps_per_second": 0.687, + "step": 12600 + }, + { + "epoch": 2.33, + "learning_rate": 6.767308049248282e-05, + "loss": 0.8531, + "step": 12620 + }, + { + "epoch": 2.33, + "learning_rate": 6.730186227804244e-05, + "loss": 0.8558, + "step": 12640 + }, + { + "epoch": 2.33, + "learning_rate": 6.693064406360205e-05, + "loss": 0.8634, + "step": 12660 + }, + { + "epoch": 2.34, + "learning_rate": 6.655942584916166e-05, + "loss": 0.862, + "step": 12680 + }, + { + "epoch": 2.34, + "learning_rate": 6.618820763472126e-05, + "loss": 0.8527, + "step": 12700 + }, + { + "epoch": 2.35, + "learning_rate": 6.581698942028089e-05, + "loss": 0.8564, + "step": 12720 + }, + { + "epoch": 2.35, + "learning_rate": 6.544577120584049e-05, + "loss": 0.8632, + "step": 12740 + }, + { + "epoch": 2.35, + "learning_rate": 6.50745529914001e-05, + "loss": 0.8497, + "step": 12760 + }, + { + "epoch": 2.36, + "learning_rate": 6.470333477695971e-05, + "loss": 0.8684, + "step": 12780 + }, + { + "epoch": 2.36, + "learning_rate": 6.433211656251933e-05, + "loss": 0.8625, + "step": 12800 + }, + { + "epoch": 2.36, + "eval_loss": 0.8092614412307739, + "eval_runtime": 1.351, + "eval_samples_per_second": 0.74, + "eval_steps_per_second": 0.74, + "step": 12800 + }, + { + "epoch": 2.36, + "learning_rate": 6.396089834807894e-05, + "loss": 0.8472, + "step": 12820 + }, + { + "epoch": 2.37, + "learning_rate": 6.358968013363855e-05, + "loss": 0.8463, + "step": 12840 + }, + { + "epoch": 2.37, + "learning_rate": 6.321846191919817e-05, + "loss": 0.8503, + "step": 12860 + }, + { + "epoch": 2.38, + "learning_rate": 6.284724370475777e-05, + "loss": 0.8501, + "step": 12880 + }, + { + "epoch": 2.38, + "learning_rate": 6.247602549031739e-05, + "loss": 0.8577, + "step": 12900 + }, + { + "epoch": 2.38, + "learning_rate": 6.210480727587699e-05, + "loss": 0.8644, + "step": 12920 + }, + { + "epoch": 2.39, + "learning_rate": 6.17335890614366e-05, + "loss": 0.8491, + "step": 12940 + }, + { + "epoch": 2.39, + "learning_rate": 6.136237084699622e-05, + "loss": 0.8589, + "step": 12960 + }, + { + "epoch": 2.39, + "learning_rate": 6.099115263255583e-05, + "loss": 0.8558, + "step": 12980 + }, + { + "epoch": 2.4, + "learning_rate": 6.061993441811544e-05, + "loss": 0.8523, + "step": 13000 + }, + { + "epoch": 2.4, + "eval_loss": 0.8120648264884949, + "eval_runtime": 1.35, + "eval_samples_per_second": 0.741, + "eval_steps_per_second": 0.741, + "step": 13000 + }, + { + "epoch": 2.4, + "learning_rate": 6.024871620367506e-05, + "loss": 0.8638, + "step": 13020 + }, + { + "epoch": 2.41, + "learning_rate": 5.9877497989234664e-05, + "loss": 0.8553, + "step": 13040 + }, + { + "epoch": 2.41, + "learning_rate": 5.950627977479428e-05, + "loss": 0.8514, + "step": 13060 + }, + { + "epoch": 2.41, + "learning_rate": 5.913506156035389e-05, + "loss": 0.847, + "step": 13080 + }, + { + "epoch": 2.42, + "learning_rate": 5.87638433459135e-05, + "loss": 0.8626, + "step": 13100 + }, + { + "epoch": 2.42, + "learning_rate": 5.839262513147311e-05, + "loss": 0.8501, + "step": 13120 + }, + { + "epoch": 2.42, + "learning_rate": 5.802140691703273e-05, + "loss": 0.8546, + "step": 13140 + }, + { + "epoch": 2.43, + "learning_rate": 5.7650188702592335e-05, + "loss": 0.85, + "step": 13160 + }, + { + "epoch": 2.43, + "learning_rate": 5.727897048815194e-05, + "loss": 0.8572, + "step": 13180 + }, + { + "epoch": 2.43, + "learning_rate": 5.690775227371156e-05, + "loss": 0.8407, + "step": 13200 + }, + { + "epoch": 2.43, + "eval_loss": 0.8303215503692627, + "eval_runtime": 1.3376, + "eval_samples_per_second": 0.748, + "eval_steps_per_second": 0.748, + "step": 13200 + }, + { + "epoch": 2.44, + "learning_rate": 5.653653405927117e-05, + "loss": 0.8494, + "step": 13220 + }, + { + "epoch": 2.44, + "learning_rate": 5.616531584483079e-05, + "loss": 0.8517, + "step": 13240 + }, + { + "epoch": 2.45, + "learning_rate": 5.579409763039039e-05, + "loss": 0.8472, + "step": 13260 + }, + { + "epoch": 2.45, + "learning_rate": 5.542287941595e-05, + "loss": 0.8573, + "step": 13280 + }, + { + "epoch": 2.45, + "learning_rate": 5.505166120150962e-05, + "loss": 0.8492, + "step": 13300 + }, + { + "epoch": 2.46, + "learning_rate": 5.4680442987069226e-05, + "loss": 0.8613, + "step": 13320 + }, + { + "epoch": 2.46, + "learning_rate": 5.430922477262884e-05, + "loss": 0.8469, + "step": 13340 + }, + { + "epoch": 2.46, + "learning_rate": 5.393800655818845e-05, + "loss": 0.854, + "step": 13360 + }, + { + "epoch": 2.47, + "learning_rate": 5.3566788343748065e-05, + "loss": 0.8404, + "step": 13380 + }, + { + "epoch": 2.47, + "learning_rate": 5.319557012930767e-05, + "loss": 0.8415, + "step": 13400 + }, + { + "epoch": 2.47, + "eval_loss": 0.7983837723731995, + "eval_runtime": 1.3234, + "eval_samples_per_second": 0.756, + "eval_steps_per_second": 0.756, + "step": 13400 + }, + { + "epoch": 2.48, + "learning_rate": 5.282435191486729e-05, + "loss": 0.8442, + "step": 13420 + }, + { + "epoch": 2.48, + "learning_rate": 5.24531337004269e-05, + "loss": 0.8515, + "step": 13440 + }, + { + "epoch": 2.48, + "learning_rate": 5.20819154859865e-05, + "loss": 0.8472, + "step": 13460 + }, + { + "epoch": 2.49, + "learning_rate": 5.171069727154612e-05, + "loss": 0.8557, + "step": 13480 + }, + { + "epoch": 2.49, + "learning_rate": 5.133947905710573e-05, + "loss": 0.8531, + "step": 13500 + }, + { + "epoch": 2.49, + "learning_rate": 5.096826084266534e-05, + "loss": 0.841, + "step": 13520 + }, + { + "epoch": 2.5, + "learning_rate": 5.0597042628224955e-05, + "loss": 0.8574, + "step": 13540 + }, + { + "epoch": 2.5, + "learning_rate": 5.022582441378457e-05, + "loss": 0.8484, + "step": 13560 + }, + { + "epoch": 2.5, + "learning_rate": 4.9854606199344175e-05, + "loss": 0.8457, + "step": 13580 + }, + { + "epoch": 2.51, + "learning_rate": 4.9483387984903794e-05, + "loss": 0.8457, + "step": 13600 + }, + { + "epoch": 2.51, + "eval_loss": 0.823489248752594, + "eval_runtime": 1.2716, + "eval_samples_per_second": 0.786, + "eval_steps_per_second": 0.786, + "step": 13600 + }, + { + "epoch": 2.51, + "learning_rate": 4.91121697704634e-05, + "loss": 0.8461, + "step": 13620 + }, + { + "epoch": 2.52, + "learning_rate": 4.874095155602301e-05, + "loss": 0.8441, + "step": 13640 + }, + { + "epoch": 2.52, + "learning_rate": 4.836973334158263e-05, + "loss": 0.8401, + "step": 13660 + }, + { + "epoch": 2.52, + "learning_rate": 4.799851512714223e-05, + "loss": 0.838, + "step": 13680 + }, + { + "epoch": 2.53, + "learning_rate": 4.7627296912701846e-05, + "loss": 0.846, + "step": 13700 + }, + { + "epoch": 2.53, + "learning_rate": 4.725607869826146e-05, + "loss": 0.8529, + "step": 13720 + }, + { + "epoch": 2.53, + "learning_rate": 4.6884860483821065e-05, + "loss": 0.8396, + "step": 13740 + }, + { + "epoch": 2.54, + "learning_rate": 4.651364226938068e-05, + "loss": 0.8367, + "step": 13760 + }, + { + "epoch": 2.54, + "learning_rate": 4.614242405494029e-05, + "loss": 0.8469, + "step": 13780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5771205840499904e-05, + "loss": 0.8463, + "step": 13800 + }, + { + "epoch": 2.55, + "eval_loss": 0.8162570595741272, + "eval_runtime": 1.311, + "eval_samples_per_second": 0.763, + "eval_steps_per_second": 0.763, + "step": 13800 + }, + { + "epoch": 2.55, + "learning_rate": 4.539998762605951e-05, + "loss": 0.8409, + "step": 13820 + }, + { + "epoch": 2.55, + "learning_rate": 4.502876941161913e-05, + "loss": 0.8465, + "step": 13840 + }, + { + "epoch": 2.56, + "learning_rate": 4.4657551197178737e-05, + "loss": 0.8468, + "step": 13860 + }, + { + "epoch": 2.56, + "learning_rate": 4.428633298273834e-05, + "loss": 0.8405, + "step": 13880 + }, + { + "epoch": 2.56, + "learning_rate": 4.391511476829796e-05, + "loss": 0.8369, + "step": 13900 + }, + { + "epoch": 2.57, + "learning_rate": 4.354389655385757e-05, + "loss": 0.8488, + "step": 13920 + }, + { + "epoch": 2.57, + "learning_rate": 4.317267833941718e-05, + "loss": 0.8451, + "step": 13940 + }, + { + "epoch": 2.57, + "learning_rate": 4.2801460124976795e-05, + "loss": 0.85, + "step": 13960 + }, + { + "epoch": 2.58, + "learning_rate": 4.243024191053641e-05, + "loss": 0.8414, + "step": 13980 + }, + { + "epoch": 2.58, + "learning_rate": 4.2059023696096014e-05, + "loss": 0.8397, + "step": 14000 + }, + { + "epoch": 2.58, + "eval_loss": 0.8420786261558533, + "eval_runtime": 1.2958, + "eval_samples_per_second": 0.772, + "eval_steps_per_second": 0.772, + "step": 14000 + }, + { + "epoch": 2.59, + "learning_rate": 4.1687805481655634e-05, + "loss": 0.8465, + "step": 14020 + }, + { + "epoch": 2.59, + "learning_rate": 4.131658726721524e-05, + "loss": 0.8441, + "step": 14040 + }, + { + "epoch": 2.59, + "learning_rate": 4.0945369052774846e-05, + "loss": 0.8418, + "step": 14060 + }, + { + "epoch": 2.6, + "learning_rate": 4.0574150838334466e-05, + "loss": 0.8301, + "step": 14080 + }, + { + "epoch": 2.6, + "learning_rate": 4.020293262389407e-05, + "loss": 0.837, + "step": 14100 + }, + { + "epoch": 2.6, + "learning_rate": 3.983171440945369e-05, + "loss": 0.8387, + "step": 14120 + }, + { + "epoch": 2.61, + "learning_rate": 3.94604961950133e-05, + "loss": 0.8525, + "step": 14140 + }, + { + "epoch": 2.61, + "learning_rate": 3.908927798057291e-05, + "loss": 0.8406, + "step": 14160 + }, + { + "epoch": 2.62, + "learning_rate": 3.8718059766132525e-05, + "loss": 0.8351, + "step": 14180 + }, + { + "epoch": 2.62, + "learning_rate": 3.834684155169213e-05, + "loss": 0.8357, + "step": 14200 + }, + { + "epoch": 2.62, + "eval_loss": 0.8169363737106323, + "eval_runtime": 1.3056, + "eval_samples_per_second": 0.766, + "eval_steps_per_second": 0.766, + "step": 14200 + }, + { + "epoch": 2.62, + "learning_rate": 3.7975623337251744e-05, + "loss": 0.8501, + "step": 14220 + }, + { + "epoch": 2.63, + "learning_rate": 3.760440512281136e-05, + "loss": 0.8351, + "step": 14240 + }, + { + "epoch": 2.63, + "learning_rate": 3.723318690837097e-05, + "loss": 0.832, + "step": 14260 + }, + { + "epoch": 2.63, + "learning_rate": 3.6861968693930576e-05, + "loss": 0.8365, + "step": 14280 + }, + { + "epoch": 2.64, + "learning_rate": 3.649075047949019e-05, + "loss": 0.8353, + "step": 14300 + }, + { + "epoch": 2.64, + "learning_rate": 3.61195322650498e-05, + "loss": 0.8344, + "step": 14320 + }, + { + "epoch": 2.64, + "learning_rate": 3.5748314050609415e-05, + "loss": 0.8441, + "step": 14340 + }, + { + "epoch": 2.65, + "learning_rate": 3.537709583616902e-05, + "loss": 0.8413, + "step": 14360 + }, + { + "epoch": 2.65, + "learning_rate": 3.5005877621728634e-05, + "loss": 0.8328, + "step": 14380 + }, + { + "epoch": 2.66, + "learning_rate": 3.463465940728825e-05, + "loss": 0.8473, + "step": 14400 + }, + { + "epoch": 2.66, + "eval_loss": 0.8171071410179138, + "eval_runtime": 1.2752, + "eval_samples_per_second": 0.784, + "eval_steps_per_second": 0.784, + "step": 14400 + }, + { + "epoch": 2.66, + "learning_rate": 3.426344119284786e-05, + "loss": 0.8369, + "step": 14420 + }, + { + "epoch": 2.66, + "learning_rate": 3.3892222978407473e-05, + "loss": 0.8406, + "step": 14440 + }, + { + "epoch": 2.67, + "learning_rate": 3.3521004763967087e-05, + "loss": 0.8292, + "step": 14460 + }, + { + "epoch": 2.67, + "learning_rate": 3.314978654952669e-05, + "loss": 0.8449, + "step": 14480 + }, + { + "epoch": 2.67, + "learning_rate": 3.2778568335086306e-05, + "loss": 0.8399, + "step": 14500 + }, + { + "epoch": 2.68, + "learning_rate": 3.240735012064592e-05, + "loss": 0.835, + "step": 14520 + }, + { + "epoch": 2.68, + "learning_rate": 3.2036131906205525e-05, + "loss": 0.8416, + "step": 14540 + }, + { + "epoch": 2.69, + "learning_rate": 3.166491369176514e-05, + "loss": 0.8369, + "step": 14560 + }, + { + "epoch": 2.69, + "learning_rate": 3.129369547732475e-05, + "loss": 0.8347, + "step": 14580 + }, + { + "epoch": 2.69, + "learning_rate": 3.0922477262884364e-05, + "loss": 0.8392, + "step": 14600 + }, + { + "epoch": 2.69, + "eval_loss": 0.8220583200454712, + "eval_runtime": 1.3692, + "eval_samples_per_second": 0.73, + "eval_steps_per_second": 0.73, + "step": 14600 + }, + { + "epoch": 2.7, + "learning_rate": 3.055125904844398e-05, + "loss": 0.8413, + "step": 14620 + }, + { + "epoch": 2.7, + "learning_rate": 3.0180040834003587e-05, + "loss": 0.8428, + "step": 14640 + }, + { + "epoch": 2.7, + "learning_rate": 2.9808822619563196e-05, + "loss": 0.8362, + "step": 14660 + }, + { + "epoch": 2.71, + "learning_rate": 2.943760440512281e-05, + "loss": 0.8324, + "step": 14680 + }, + { + "epoch": 2.71, + "learning_rate": 2.9066386190682422e-05, + "loss": 0.8363, + "step": 14700 + }, + { + "epoch": 2.71, + "learning_rate": 2.869516797624203e-05, + "loss": 0.841, + "step": 14720 + }, + { + "epoch": 2.72, + "learning_rate": 2.832394976180164e-05, + "loss": 0.8357, + "step": 14740 + }, + { + "epoch": 2.72, + "learning_rate": 2.7952731547361255e-05, + "loss": 0.8309, + "step": 14760 + }, + { + "epoch": 2.73, + "learning_rate": 2.7581513332920868e-05, + "loss": 0.8361, + "step": 14780 + }, + { + "epoch": 2.73, + "learning_rate": 2.7210295118480477e-05, + "loss": 0.8404, + "step": 14800 + }, + { + "epoch": 2.73, + "eval_loss": 0.8303062915802002, + "eval_runtime": 1.4178, + "eval_samples_per_second": 0.705, + "eval_steps_per_second": 0.705, + "step": 14800 + }, + { + "epoch": 2.73, + "learning_rate": 2.683907690404009e-05, + "loss": 0.8418, + "step": 14820 + }, + { + "epoch": 2.74, + "learning_rate": 2.6467858689599703e-05, + "loss": 0.8466, + "step": 14840 + }, + { + "epoch": 2.74, + "learning_rate": 2.609664047515931e-05, + "loss": 0.8288, + "step": 14860 + }, + { + "epoch": 2.74, + "learning_rate": 2.5725422260718923e-05, + "loss": 0.8311, + "step": 14880 + }, + { + "epoch": 2.75, + "learning_rate": 2.5354204046278536e-05, + "loss": 0.8369, + "step": 14900 + }, + { + "epoch": 2.75, + "learning_rate": 2.4982985831838145e-05, + "loss": 0.8275, + "step": 14920 + }, + { + "epoch": 2.76, + "learning_rate": 2.461176761739776e-05, + "loss": 0.8283, + "step": 14940 + }, + { + "epoch": 2.76, + "learning_rate": 2.424054940295737e-05, + "loss": 0.8367, + "step": 14960 + }, + { + "epoch": 2.76, + "learning_rate": 2.386933118851698e-05, + "loss": 0.8247, + "step": 14980 + }, + { + "epoch": 2.77, + "learning_rate": 2.3498112974076594e-05, + "loss": 0.8405, + "step": 15000 + }, + { + "epoch": 2.77, + "eval_loss": 0.828734815120697, + "eval_runtime": 1.3774, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.726, + "step": 15000 + }, + { + "epoch": 2.77, + "learning_rate": 2.3126894759636204e-05, + "loss": 0.822, + "step": 15020 + }, + { + "epoch": 2.77, + "learning_rate": 2.2774237455917836e-05, + "loss": 0.8458, + "step": 15040 + }, + { + "epoch": 2.78, + "learning_rate": 2.240301924147745e-05, + "loss": 0.8282, + "step": 15060 + }, + { + "epoch": 2.78, + "learning_rate": 2.2031801027037055e-05, + "loss": 0.8241, + "step": 15080 + }, + { + "epoch": 2.78, + "learning_rate": 2.166058281259667e-05, + "loss": 0.8322, + "step": 15100 + }, + { + "epoch": 2.79, + "learning_rate": 2.128936459815628e-05, + "loss": 0.8343, + "step": 15120 + }, + { + "epoch": 2.79, + "learning_rate": 2.091814638371589e-05, + "loss": 0.8238, + "step": 15140 + }, + { + "epoch": 2.8, + "learning_rate": 2.0546928169275504e-05, + "loss": 0.8381, + "step": 15160 + }, + { + "epoch": 2.8, + "learning_rate": 2.0175709954835117e-05, + "loss": 0.8301, + "step": 15180 + }, + { + "epoch": 2.8, + "learning_rate": 1.980449174039473e-05, + "loss": 0.8177, + "step": 15200 + }, + { + "epoch": 2.8, + "eval_loss": 0.8177865743637085, + "eval_runtime": 1.3723, + "eval_samples_per_second": 0.729, + "eval_steps_per_second": 0.729, + "step": 15200 + }, + { + "epoch": 2.81, + "learning_rate": 1.9433273525954336e-05, + "loss": 0.8324, + "step": 15220 + }, + { + "epoch": 2.81, + "learning_rate": 1.906205531151395e-05, + "loss": 0.83, + "step": 15240 + }, + { + "epoch": 2.81, + "learning_rate": 1.8690837097073562e-05, + "loss": 0.8283, + "step": 15260 + }, + { + "epoch": 2.82, + "learning_rate": 1.8319618882633172e-05, + "loss": 0.8262, + "step": 15280 + }, + { + "epoch": 2.82, + "learning_rate": 1.7948400668192785e-05, + "loss": 0.8361, + "step": 15300 + }, + { + "epoch": 2.83, + "learning_rate": 1.7577182453752395e-05, + "loss": 0.832, + "step": 15320 + }, + { + "epoch": 2.83, + "learning_rate": 1.7205964239312008e-05, + "loss": 0.8294, + "step": 15340 + }, + { + "epoch": 2.83, + "learning_rate": 1.683474602487162e-05, + "loss": 0.8215, + "step": 15360 + }, + { + "epoch": 2.84, + "learning_rate": 1.646352781043123e-05, + "loss": 0.8337, + "step": 15380 + }, + { + "epoch": 2.84, + "learning_rate": 1.6092309595990843e-05, + "loss": 0.8274, + "step": 15400 + }, + { + "epoch": 2.84, + "eval_loss": 0.810486376285553, + "eval_runtime": 1.3289, + "eval_samples_per_second": 0.753, + "eval_steps_per_second": 0.753, + "step": 15400 + }, + { + "epoch": 2.84, + "learning_rate": 1.5721091381550453e-05, + "loss": 0.8162, + "step": 15420 + }, + { + "epoch": 2.85, + "learning_rate": 1.5349873167110063e-05, + "loss": 0.8229, + "step": 15440 + }, + { + "epoch": 2.85, + "learning_rate": 1.4978654952669677e-05, + "loss": 0.8272, + "step": 15460 + }, + { + "epoch": 2.86, + "learning_rate": 1.4607436738229287e-05, + "loss": 0.8319, + "step": 15480 + }, + { + "epoch": 2.86, + "learning_rate": 1.4236218523788898e-05, + "loss": 0.8207, + "step": 15500 + }, + { + "epoch": 2.86, + "learning_rate": 1.3865000309348511e-05, + "loss": 0.8273, + "step": 15520 + }, + { + "epoch": 2.87, + "learning_rate": 1.3493782094908123e-05, + "loss": 0.825, + "step": 15540 + }, + { + "epoch": 2.87, + "learning_rate": 1.3122563880467734e-05, + "loss": 0.825, + "step": 15560 + }, + { + "epoch": 2.87, + "learning_rate": 1.2751345666027345e-05, + "loss": 0.8304, + "step": 15580 + }, + { + "epoch": 2.88, + "learning_rate": 1.2380127451586957e-05, + "loss": 0.8246, + "step": 15600 + }, + { + "epoch": 2.88, + "eval_loss": 0.8182739615440369, + "eval_runtime": 1.4484, + "eval_samples_per_second": 0.69, + "eval_steps_per_second": 0.69, + "step": 15600 + }, + { + "epoch": 2.88, + "learning_rate": 1.200890923714657e-05, + "loss": 0.8231, + "step": 15620 + }, + { + "epoch": 2.88, + "learning_rate": 1.163769102270618e-05, + "loss": 0.8232, + "step": 15640 + }, + { + "epoch": 2.89, + "learning_rate": 1.126647280826579e-05, + "loss": 0.827, + "step": 15660 + }, + { + "epoch": 2.89, + "learning_rate": 1.0895254593825404e-05, + "loss": 0.821, + "step": 15680 + }, + { + "epoch": 2.9, + "learning_rate": 1.0524036379385013e-05, + "loss": 0.8294, + "step": 15700 + }, + { + "epoch": 2.9, + "learning_rate": 1.0152818164944625e-05, + "loss": 0.8324, + "step": 15720 + }, + { + "epoch": 2.9, + "learning_rate": 9.781599950504238e-06, + "loss": 0.826, + "step": 15740 + }, + { + "epoch": 2.91, + "learning_rate": 9.410381736063849e-06, + "loss": 0.8277, + "step": 15760 + }, + { + "epoch": 2.91, + "learning_rate": 9.03916352162346e-06, + "loss": 0.8286, + "step": 15780 + }, + { + "epoch": 2.91, + "learning_rate": 8.667945307183072e-06, + "loss": 0.8229, + "step": 15800 + }, + { + "epoch": 2.91, + "eval_loss": 0.8074170351028442, + "eval_runtime": 1.3094, + "eval_samples_per_second": 0.764, + "eval_steps_per_second": 0.764, + "step": 15800 + }, + { + "epoch": 2.92, + "learning_rate": 8.296727092742683e-06, + "loss": 0.8249, + "step": 15820 + }, + { + "epoch": 2.92, + "learning_rate": 7.925508878302294e-06, + "loss": 0.824, + "step": 15840 + }, + { + "epoch": 2.93, + "learning_rate": 7.554290663861906e-06, + "loss": 0.8244, + "step": 15860 + }, + { + "epoch": 2.93, + "learning_rate": 7.183072449421518e-06, + "loss": 0.8257, + "step": 15880 + }, + { + "epoch": 2.93, + "learning_rate": 6.81185423498113e-06, + "loss": 0.8176, + "step": 15900 + }, + { + "epoch": 2.94, + "learning_rate": 6.44063602054074e-06, + "loss": 0.8287, + "step": 15920 + }, + { + "epoch": 2.94, + "learning_rate": 6.0694178061003525e-06, + "loss": 0.8272, + "step": 15940 + }, + { + "epoch": 2.94, + "learning_rate": 5.698199591659964e-06, + "loss": 0.8193, + "step": 15960 + }, + { + "epoch": 2.95, + "learning_rate": 5.326981377219574e-06, + "loss": 0.8067, + "step": 15980 + }, + { + "epoch": 2.95, + "learning_rate": 4.9557631627791865e-06, + "loss": 0.8213, + "step": 16000 + }, + { + "epoch": 2.95, + "eval_loss": 0.816523551940918, + "eval_runtime": 1.3844, + "eval_samples_per_second": 0.722, + "eval_steps_per_second": 0.722, + "step": 16000 + }, + { + "epoch": 2.95, + "learning_rate": 4.584544948338798e-06, + "loss": 0.8285, + "step": 16020 + }, + { + "epoch": 2.96, + "learning_rate": 4.213326733898409e-06, + "loss": 0.823, + "step": 16040 + }, + { + "epoch": 2.96, + "learning_rate": 3.842108519458021e-06, + "loss": 0.8245, + "step": 16060 + }, + { + "epoch": 2.97, + "learning_rate": 3.4708903050176327e-06, + "loss": 0.8222, + "step": 16080 + }, + { + "epoch": 2.97, + "learning_rate": 3.0996720905772444e-06, + "loss": 0.8244, + "step": 16100 + }, + { + "epoch": 2.97, + "learning_rate": 2.7284538761368553e-06, + "loss": 0.8269, + "step": 16120 + }, + { + "epoch": 2.98, + "learning_rate": 2.357235661696467e-06, + "loss": 0.831, + "step": 16140 + }, + { + "epoch": 2.98, + "learning_rate": 1.9860174472560784e-06, + "loss": 0.8272, + "step": 16160 + }, + { + "epoch": 2.98, + "learning_rate": 1.6147992328156901e-06, + "loss": 0.8302, + "step": 16180 + }, + { + "epoch": 2.99, + "learning_rate": 1.2435810183753015e-06, + "loss": 0.8074, + "step": 16200 + }, + { + "epoch": 2.99, + "eval_loss": 0.8098793625831604, + "eval_runtime": 1.5007, + "eval_samples_per_second": 0.666, + "eval_steps_per_second": 0.666, + "step": 16200 + } + ], + "max_steps": 16263, + "num_train_epochs": 3, + "total_flos": 1.0744586862539571e+19, + "trial_name": null, + "trial_params": null +}