{ "best_metric": 0.7872186303138733, "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle-cot7b/checkpoint-15200", "epoch": 2.9124353324391645, "global_step": 15200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.9999999999999995e-05, "loss": 1.8213, "step": 20 }, { "epoch": 0.01, "learning_rate": 0.00011999999999999999, "loss": 1.5494, "step": 40 }, { "epoch": 0.01, "learning_rate": 0.00017999999999999998, "loss": 1.179, "step": 60 }, { "epoch": 0.02, "learning_rate": 0.00023999999999999998, "loss": 1.1022, "step": 80 }, { "epoch": 0.02, "learning_rate": 0.0003, "loss": 1.078, "step": 100 }, { "epoch": 0.02, "learning_rate": 0.00029961432152728675, "loss": 1.0347, "step": 120 }, { "epoch": 0.03, "learning_rate": 0.0002992286430545735, "loss": 1.0169, "step": 140 }, { "epoch": 0.03, "learning_rate": 0.00029884296458186025, "loss": 1.0088, "step": 160 }, { "epoch": 0.03, "learning_rate": 0.00029845728610914697, "loss": 0.9896, "step": 180 }, { "epoch": 0.04, "learning_rate": 0.00029807160763643375, "loss": 0.99, "step": 200 }, { "epoch": 0.04, "eval_loss": 1.0032634735107422, "eval_runtime": 16.3952, "eval_samples_per_second": 121.987, "eval_steps_per_second": 1.952, "step": 200 }, { "epoch": 0.04, "learning_rate": 0.00029768592916372047, "loss": 0.9724, "step": 220 }, { "epoch": 0.05, "learning_rate": 0.00029730025069100725, "loss": 0.9719, "step": 240 }, { "epoch": 0.05, "learning_rate": 0.00029691457221829397, "loss": 0.9652, "step": 260 }, { "epoch": 0.05, "learning_rate": 0.00029652889374558075, "loss": 0.9579, "step": 280 }, { "epoch": 0.06, "learning_rate": 0.00029614321527286747, "loss": 0.9532, "step": 300 }, { "epoch": 0.06, "learning_rate": 0.00029575753680015425, "loss": 0.9613, "step": 320 }, { "epoch": 0.07, "learning_rate": 0.000295371858327441, "loss": 0.9473, "step": 340 }, { "epoch": 0.07, "learning_rate": 0.00029498617985472775, "loss": 0.9416, "step": 360 }, { "epoch": 0.07, "learning_rate": 0.0002946005013820145, "loss": 0.9386, "step": 380 }, { "epoch": 0.08, "learning_rate": 0.00029421482290930125, "loss": 0.9338, "step": 400 }, { "epoch": 0.08, "eval_loss": 0.957970380783081, "eval_runtime": 16.3897, "eval_samples_per_second": 122.028, "eval_steps_per_second": 1.952, "step": 400 }, { "epoch": 0.08, "learning_rate": 0.000293829144436588, "loss": 0.937, "step": 420 }, { "epoch": 0.08, "learning_rate": 0.00029344346596387475, "loss": 0.9304, "step": 440 }, { "epoch": 0.09, "learning_rate": 0.0002930577874911615, "loss": 0.9323, "step": 460 }, { "epoch": 0.09, "learning_rate": 0.00029267210901844825, "loss": 0.9185, "step": 480 }, { "epoch": 0.1, "learning_rate": 0.000292286430545735, "loss": 0.9273, "step": 500 }, { "epoch": 0.1, "learning_rate": 0.00029190075207302175, "loss": 0.922, "step": 520 }, { "epoch": 0.1, "learning_rate": 0.0002915150736003085, "loss": 0.9146, "step": 540 }, { "epoch": 0.11, "learning_rate": 0.00029112939512759525, "loss": 0.9129, "step": 560 }, { "epoch": 0.11, "learning_rate": 0.000290743716654882, "loss": 0.9146, "step": 580 }, { "epoch": 0.11, "learning_rate": 0.0002903580381821688, "loss": 0.9078, "step": 600 }, { "epoch": 0.11, "eval_loss": 0.9345074892044067, "eval_runtime": 16.4049, "eval_samples_per_second": 121.914, "eval_steps_per_second": 1.951, "step": 600 }, { "epoch": 0.12, "learning_rate": 0.0002899723597094555, "loss": 0.9004, "step": 620 }, { "epoch": 0.12, "learning_rate": 0.0002895866812367423, "loss": 0.9042, "step": 640 }, { "epoch": 0.13, "learning_rate": 0.000289201002764029, "loss": 0.9028, "step": 660 }, { "epoch": 0.13, "learning_rate": 0.0002888153242913158, "loss": 0.8889, "step": 680 }, { "epoch": 0.13, "learning_rate": 0.0002884296458186025, "loss": 0.8935, "step": 700 }, { "epoch": 0.14, "learning_rate": 0.0002880439673458893, "loss": 0.9024, "step": 720 }, { "epoch": 0.14, "learning_rate": 0.000287658288873176, "loss": 0.8922, "step": 740 }, { "epoch": 0.15, "learning_rate": 0.0002872726104004628, "loss": 0.8896, "step": 760 }, { "epoch": 0.15, "learning_rate": 0.0002868869319277495, "loss": 0.8907, "step": 780 }, { "epoch": 0.15, "learning_rate": 0.0002865012534550363, "loss": 0.8922, "step": 800 }, { "epoch": 0.15, "eval_loss": 0.9149895310401917, "eval_runtime": 16.4499, "eval_samples_per_second": 121.581, "eval_steps_per_second": 1.945, "step": 800 }, { "epoch": 0.16, "learning_rate": 0.0002861155749823231, "loss": 0.8867, "step": 820 }, { "epoch": 0.16, "learning_rate": 0.0002857298965096098, "loss": 0.891, "step": 840 }, { "epoch": 0.16, "learning_rate": 0.00028534421803689657, "loss": 0.8882, "step": 860 }, { "epoch": 0.17, "learning_rate": 0.0002849585395641833, "loss": 0.8835, "step": 880 }, { "epoch": 0.17, "learning_rate": 0.00028457286109147007, "loss": 0.8798, "step": 900 }, { "epoch": 0.18, "learning_rate": 0.0002841871826187568, "loss": 0.8784, "step": 920 }, { "epoch": 0.18, "learning_rate": 0.00028380150414604357, "loss": 0.8841, "step": 940 }, { "epoch": 0.18, "learning_rate": 0.0002834158256733303, "loss": 0.8787, "step": 960 }, { "epoch": 0.19, "learning_rate": 0.00028303014720061707, "loss": 0.8693, "step": 980 }, { "epoch": 0.19, "learning_rate": 0.0002826444687279038, "loss": 0.8711, "step": 1000 }, { "epoch": 0.19, "eval_loss": 0.9027432799339294, "eval_runtime": 16.447, "eval_samples_per_second": 121.603, "eval_steps_per_second": 1.946, "step": 1000 }, { "epoch": 0.2, "learning_rate": 0.00028225879025519057, "loss": 0.876, "step": 1020 }, { "epoch": 0.2, "learning_rate": 0.0002818731117824773, "loss": 0.8749, "step": 1040 }, { "epoch": 0.2, "learning_rate": 0.00028148743330976407, "loss": 0.877, "step": 1060 }, { "epoch": 0.21, "learning_rate": 0.00028110175483705085, "loss": 0.8754, "step": 1080 }, { "epoch": 0.21, "learning_rate": 0.00028071607636433757, "loss": 0.8792, "step": 1100 }, { "epoch": 0.21, "learning_rate": 0.00028033039789162435, "loss": 0.8701, "step": 1120 }, { "epoch": 0.22, "learning_rate": 0.00027994471941891107, "loss": 0.8667, "step": 1140 }, { "epoch": 0.22, "learning_rate": 0.00027955904094619785, "loss": 0.8769, "step": 1160 }, { "epoch": 0.23, "learning_rate": 0.00027917336247348457, "loss": 0.8734, "step": 1180 }, { "epoch": 0.23, "learning_rate": 0.0002787876840007713, "loss": 0.8708, "step": 1200 }, { "epoch": 0.23, "eval_loss": 0.8911536335945129, "eval_runtime": 16.423, "eval_samples_per_second": 121.78, "eval_steps_per_second": 1.948, "step": 1200 }, { "epoch": 0.23, "learning_rate": 0.00027840200552805807, "loss": 0.8673, "step": 1220 }, { "epoch": 0.24, "learning_rate": 0.00027801632705534485, "loss": 0.8618, "step": 1240 }, { "epoch": 0.24, "learning_rate": 0.00027763064858263157, "loss": 0.8739, "step": 1260 }, { "epoch": 0.25, "learning_rate": 0.00027724497010991834, "loss": 0.8608, "step": 1280 }, { "epoch": 0.25, "learning_rate": 0.0002768592916372051, "loss": 0.8631, "step": 1300 }, { "epoch": 0.25, "learning_rate": 0.00027647361316449184, "loss": 0.8547, "step": 1320 }, { "epoch": 0.26, "learning_rate": 0.0002760879346917786, "loss": 0.8589, "step": 1340 }, { "epoch": 0.26, "learning_rate": 0.00027570225621906534, "loss": 0.8615, "step": 1360 }, { "epoch": 0.26, "learning_rate": 0.0002753165777463521, "loss": 0.8644, "step": 1380 }, { "epoch": 0.27, "learning_rate": 0.00027493089927363884, "loss": 0.8524, "step": 1400 }, { "epoch": 0.27, "eval_loss": 0.8813066482543945, "eval_runtime": 16.4628, "eval_samples_per_second": 121.486, "eval_steps_per_second": 1.944, "step": 1400 }, { "epoch": 0.27, "learning_rate": 0.00027454522080092557, "loss": 0.8562, "step": 1420 }, { "epoch": 0.28, "learning_rate": 0.00027415954232821234, "loss": 0.8547, "step": 1440 }, { "epoch": 0.28, "learning_rate": 0.0002737738638554991, "loss": 0.8599, "step": 1460 }, { "epoch": 0.28, "learning_rate": 0.00027338818538278584, "loss": 0.8491, "step": 1480 }, { "epoch": 0.29, "learning_rate": 0.0002730025069100726, "loss": 0.8496, "step": 1500 }, { "epoch": 0.29, "learning_rate": 0.0002726168284373594, "loss": 0.8594, "step": 1520 }, { "epoch": 0.3, "learning_rate": 0.0002722311499646461, "loss": 0.8512, "step": 1540 }, { "epoch": 0.3, "learning_rate": 0.0002718454714919329, "loss": 0.8441, "step": 1560 }, { "epoch": 0.3, "learning_rate": 0.0002714597930192196, "loss": 0.8621, "step": 1580 }, { "epoch": 0.31, "learning_rate": 0.0002710741145465064, "loss": 0.8525, "step": 1600 }, { "epoch": 0.31, "eval_loss": 0.8728711009025574, "eval_runtime": 16.4389, "eval_samples_per_second": 121.663, "eval_steps_per_second": 1.947, "step": 1600 }, { "epoch": 0.31, "learning_rate": 0.0002706884360737931, "loss": 0.852, "step": 1620 }, { "epoch": 0.31, "learning_rate": 0.00027030275760107984, "loss": 0.8553, "step": 1640 }, { "epoch": 0.32, "learning_rate": 0.0002699170791283666, "loss": 0.8445, "step": 1660 }, { "epoch": 0.32, "learning_rate": 0.0002695314006556534, "loss": 0.8518, "step": 1680 }, { "epoch": 0.33, "learning_rate": 0.0002691457221829401, "loss": 0.8318, "step": 1700 }, { "epoch": 0.33, "learning_rate": 0.0002687600437102269, "loss": 0.8492, "step": 1720 }, { "epoch": 0.33, "learning_rate": 0.0002683743652375136, "loss": 0.8475, "step": 1740 }, { "epoch": 0.34, "learning_rate": 0.0002679886867648004, "loss": 0.8437, "step": 1760 }, { "epoch": 0.34, "learning_rate": 0.00026760300829208717, "loss": 0.8355, "step": 1780 }, { "epoch": 0.34, "learning_rate": 0.0002672173298193739, "loss": 0.8486, "step": 1800 }, { "epoch": 0.34, "eval_loss": 0.8663893938064575, "eval_runtime": 16.4511, "eval_samples_per_second": 121.572, "eval_steps_per_second": 1.945, "step": 1800 }, { "epoch": 0.35, "learning_rate": 0.00026683165134666067, "loss": 0.8449, "step": 1820 }, { "epoch": 0.35, "learning_rate": 0.0002664459728739474, "loss": 0.853, "step": 1840 }, { "epoch": 0.36, "learning_rate": 0.00026606029440123417, "loss": 0.8472, "step": 1860 }, { "epoch": 0.36, "learning_rate": 0.0002656746159285209, "loss": 0.83, "step": 1880 }, { "epoch": 0.36, "learning_rate": 0.0002652889374558076, "loss": 0.8398, "step": 1900 }, { "epoch": 0.37, "learning_rate": 0.0002649032589830944, "loss": 0.8337, "step": 1920 }, { "epoch": 0.37, "learning_rate": 0.00026451758051038117, "loss": 0.8314, "step": 1940 }, { "epoch": 0.38, "learning_rate": 0.0002641319020376679, "loss": 0.8314, "step": 1960 }, { "epoch": 0.38, "learning_rate": 0.00026374622356495467, "loss": 0.845, "step": 1980 }, { "epoch": 0.38, "learning_rate": 0.00026336054509224144, "loss": 0.8294, "step": 2000 }, { "epoch": 0.38, "eval_loss": 0.8619188666343689, "eval_runtime": 16.4444, "eval_samples_per_second": 121.622, "eval_steps_per_second": 1.946, "step": 2000 }, { "epoch": 0.39, "learning_rate": 0.00026297486661952817, "loss": 0.8404, "step": 2020 }, { "epoch": 0.39, "learning_rate": 0.00026258918814681494, "loss": 0.839, "step": 2040 }, { "epoch": 0.39, "learning_rate": 0.00026220350967410167, "loss": 0.84, "step": 2060 }, { "epoch": 0.4, "learning_rate": 0.00026181783120138844, "loss": 0.8442, "step": 2080 }, { "epoch": 0.4, "learning_rate": 0.00026143215272867517, "loss": 0.8443, "step": 2100 }, { "epoch": 0.41, "learning_rate": 0.0002610464742559619, "loss": 0.8301, "step": 2120 }, { "epoch": 0.41, "learning_rate": 0.00026066079578324867, "loss": 0.8302, "step": 2140 }, { "epoch": 0.41, "learning_rate": 0.00026027511731053544, "loss": 0.836, "step": 2160 }, { "epoch": 0.42, "learning_rate": 0.00025988943883782216, "loss": 0.8277, "step": 2180 }, { "epoch": 0.42, "learning_rate": 0.00025950376036510894, "loss": 0.8335, "step": 2200 }, { "epoch": 0.42, "eval_loss": 0.8562669157981873, "eval_runtime": 16.4486, "eval_samples_per_second": 121.591, "eval_steps_per_second": 1.945, "step": 2200 }, { "epoch": 0.43, "learning_rate": 0.00025911808189239566, "loss": 0.8267, "step": 2220 }, { "epoch": 0.43, "learning_rate": 0.00025873240341968244, "loss": 0.8267, "step": 2240 }, { "epoch": 0.43, "learning_rate": 0.0002583467249469692, "loss": 0.8293, "step": 2260 }, { "epoch": 0.44, "learning_rate": 0.00025796104647425594, "loss": 0.836, "step": 2280 }, { "epoch": 0.44, "learning_rate": 0.0002575753680015427, "loss": 0.8255, "step": 2300 }, { "epoch": 0.44, "learning_rate": 0.00025718968952882944, "loss": 0.8177, "step": 2320 }, { "epoch": 0.45, "learning_rate": 0.00025680401105611616, "loss": 0.8272, "step": 2340 }, { "epoch": 0.45, "learning_rate": 0.00025641833258340294, "loss": 0.831, "step": 2360 }, { "epoch": 0.46, "learning_rate": 0.00025603265411068966, "loss": 0.819, "step": 2380 }, { "epoch": 0.46, "learning_rate": 0.00025564697563797644, "loss": 0.8216, "step": 2400 }, { "epoch": 0.46, "eval_loss": 0.8516544103622437, "eval_runtime": 16.476, "eval_samples_per_second": 121.389, "eval_steps_per_second": 1.942, "step": 2400 }, { "epoch": 0.46, "learning_rate": 0.0002552612971652632, "loss": 0.8305, "step": 2420 }, { "epoch": 0.47, "learning_rate": 0.00025487561869254994, "loss": 0.8305, "step": 2440 }, { "epoch": 0.47, "learning_rate": 0.0002544899402198367, "loss": 0.8302, "step": 2460 }, { "epoch": 0.48, "learning_rate": 0.0002541042617471235, "loss": 0.824, "step": 2480 }, { "epoch": 0.48, "learning_rate": 0.0002537185832744102, "loss": 0.8315, "step": 2500 }, { "epoch": 0.48, "learning_rate": 0.000253332904801697, "loss": 0.8224, "step": 2520 }, { "epoch": 0.49, "learning_rate": 0.0002529472263289837, "loss": 0.8229, "step": 2540 }, { "epoch": 0.49, "learning_rate": 0.00025256154785627044, "loss": 0.8156, "step": 2560 }, { "epoch": 0.49, "learning_rate": 0.0002521758693835572, "loss": 0.8319, "step": 2580 }, { "epoch": 0.5, "learning_rate": 0.00025179019091084394, "loss": 0.8222, "step": 2600 }, { "epoch": 0.5, "eval_loss": 0.8481459021568298, "eval_runtime": 16.453, "eval_samples_per_second": 121.558, "eval_steps_per_second": 1.945, "step": 2600 }, { "epoch": 0.5, "learning_rate": 0.0002514045124381307, "loss": 0.8205, "step": 2620 }, { "epoch": 0.51, "learning_rate": 0.0002510188339654175, "loss": 0.8267, "step": 2640 }, { "epoch": 0.51, "learning_rate": 0.0002506331554927042, "loss": 0.8116, "step": 2660 }, { "epoch": 0.51, "learning_rate": 0.000250247477019991, "loss": 0.8239, "step": 2680 }, { "epoch": 0.52, "learning_rate": 0.00024986179854727777, "loss": 0.8126, "step": 2700 }, { "epoch": 0.52, "learning_rate": 0.0002494761200745645, "loss": 0.8226, "step": 2720 }, { "epoch": 0.53, "learning_rate": 0.00024909044160185127, "loss": 0.8173, "step": 2740 }, { "epoch": 0.53, "learning_rate": 0.000248704763129138, "loss": 0.8227, "step": 2760 }, { "epoch": 0.53, "learning_rate": 0.0002483190846564247, "loss": 0.8129, "step": 2780 }, { "epoch": 0.54, "learning_rate": 0.0002479334061837115, "loss": 0.8164, "step": 2800 }, { "epoch": 0.54, "eval_loss": 0.8439643979072571, "eval_runtime": 16.4767, "eval_samples_per_second": 121.384, "eval_steps_per_second": 1.942, "step": 2800 }, { "epoch": 0.54, "learning_rate": 0.0002475477277109982, "loss": 0.807, "step": 2820 }, { "epoch": 0.54, "learning_rate": 0.000247162049238285, "loss": 0.8126, "step": 2840 }, { "epoch": 0.55, "learning_rate": 0.00024677637076557176, "loss": 0.8193, "step": 2860 }, { "epoch": 0.55, "learning_rate": 0.0002463906922928585, "loss": 0.8091, "step": 2880 }, { "epoch": 0.56, "learning_rate": 0.00024600501382014526, "loss": 0.8147, "step": 2900 }, { "epoch": 0.56, "learning_rate": 0.000245619335347432, "loss": 0.8207, "step": 2920 }, { "epoch": 0.56, "learning_rate": 0.00024523365687471876, "loss": 0.8087, "step": 2940 }, { "epoch": 0.57, "learning_rate": 0.00024484797840200554, "loss": 0.8198, "step": 2960 }, { "epoch": 0.57, "learning_rate": 0.00024446229992929226, "loss": 0.8087, "step": 2980 }, { "epoch": 0.57, "learning_rate": 0.000244076621456579, "loss": 0.8182, "step": 3000 }, { "epoch": 0.57, "eval_loss": 0.8408891558647156, "eval_runtime": 16.4801, "eval_samples_per_second": 121.358, "eval_steps_per_second": 1.942, "step": 3000 }, { "epoch": 0.58, "learning_rate": 0.0002436909429838658, "loss": 0.8188, "step": 3020 }, { "epoch": 0.58, "learning_rate": 0.0002433052645111525, "loss": 0.8082, "step": 3040 }, { "epoch": 0.59, "learning_rate": 0.00024291958603843926, "loss": 0.8171, "step": 3060 }, { "epoch": 0.59, "learning_rate": 0.000242533907565726, "loss": 0.8088, "step": 3080 }, { "epoch": 0.59, "learning_rate": 0.00024214822909301276, "loss": 0.8148, "step": 3100 }, { "epoch": 0.6, "learning_rate": 0.00024176255062029954, "loss": 0.8122, "step": 3120 }, { "epoch": 0.6, "learning_rate": 0.00024137687214758626, "loss": 0.811, "step": 3140 }, { "epoch": 0.61, "learning_rate": 0.00024099119367487304, "loss": 0.8179, "step": 3160 }, { "epoch": 0.61, "learning_rate": 0.0002406055152021598, "loss": 0.8029, "step": 3180 }, { "epoch": 0.61, "learning_rate": 0.0002402198367294465, "loss": 0.8143, "step": 3200 }, { "epoch": 0.61, "eval_loss": 0.837196946144104, "eval_runtime": 16.4913, "eval_samples_per_second": 121.276, "eval_steps_per_second": 1.94, "step": 3200 }, { "epoch": 0.62, "learning_rate": 0.0002398341582567333, "loss": 0.7969, "step": 3220 }, { "epoch": 0.62, "learning_rate": 0.00023944847978402, "loss": 0.8158, "step": 3240 }, { "epoch": 0.62, "learning_rate": 0.0002390628013113068, "loss": 0.8019, "step": 3260 }, { "epoch": 0.63, "learning_rate": 0.00023867712283859354, "loss": 0.8042, "step": 3280 }, { "epoch": 0.63, "learning_rate": 0.0002382914443658803, "loss": 0.8022, "step": 3300 }, { "epoch": 0.64, "learning_rate": 0.00023790576589316704, "loss": 0.8043, "step": 3320 }, { "epoch": 0.64, "learning_rate": 0.0002375200874204538, "loss": 0.8106, "step": 3340 }, { "epoch": 0.64, "learning_rate": 0.00023713440894774054, "loss": 0.8146, "step": 3360 }, { "epoch": 0.65, "learning_rate": 0.0002367487304750273, "loss": 0.8004, "step": 3380 }, { "epoch": 0.65, "learning_rate": 0.00023636305200231404, "loss": 0.8096, "step": 3400 }, { "epoch": 0.65, "eval_loss": 0.8347571492195129, "eval_runtime": 16.4822, "eval_samples_per_second": 121.343, "eval_steps_per_second": 1.941, "step": 3400 }, { "epoch": 0.66, "learning_rate": 0.0002359773735296008, "loss": 0.8226, "step": 3420 }, { "epoch": 0.66, "learning_rate": 0.00023559169505688756, "loss": 0.8083, "step": 3440 }, { "epoch": 0.66, "learning_rate": 0.00023520601658417428, "loss": 0.8168, "step": 3460 }, { "epoch": 0.67, "learning_rate": 0.00023482033811146106, "loss": 0.8112, "step": 3480 }, { "epoch": 0.67, "learning_rate": 0.0002344346596387478, "loss": 0.8131, "step": 3500 }, { "epoch": 0.67, "learning_rate": 0.00023404898116603456, "loss": 0.8097, "step": 3520 }, { "epoch": 0.68, "learning_rate": 0.0002336633026933213, "loss": 0.804, "step": 3540 }, { "epoch": 0.68, "learning_rate": 0.00023327762422060806, "loss": 0.8085, "step": 3560 }, { "epoch": 0.69, "learning_rate": 0.0002328919457478948, "loss": 0.7992, "step": 3580 }, { "epoch": 0.69, "learning_rate": 0.0002325062672751816, "loss": 0.8124, "step": 3600 }, { "epoch": 0.69, "eval_loss": 0.8324670791625977, "eval_runtime": 16.4936, "eval_samples_per_second": 121.259, "eval_steps_per_second": 1.94, "step": 3600 }, { "epoch": 0.69, "learning_rate": 0.0002321205888024683, "loss": 0.8024, "step": 3620 }, { "epoch": 0.7, "learning_rate": 0.0002317349103297551, "loss": 0.8032, "step": 3640 }, { "epoch": 0.7, "learning_rate": 0.00023134923185704184, "loss": 0.8065, "step": 3660 }, { "epoch": 0.71, "learning_rate": 0.00023096355338432856, "loss": 0.8106, "step": 3680 }, { "epoch": 0.71, "learning_rate": 0.00023057787491161534, "loss": 0.8009, "step": 3700 }, { "epoch": 0.71, "learning_rate": 0.00023019219643890206, "loss": 0.816, "step": 3720 }, { "epoch": 0.72, "learning_rate": 0.00022980651796618884, "loss": 0.8103, "step": 3740 }, { "epoch": 0.72, "learning_rate": 0.00022942083949347559, "loss": 0.8099, "step": 3760 }, { "epoch": 0.72, "learning_rate": 0.00022903516102076233, "loss": 0.8085, "step": 3780 }, { "epoch": 0.73, "learning_rate": 0.00022864948254804908, "loss": 0.8044, "step": 3800 }, { "epoch": 0.73, "eval_loss": 0.830141544342041, "eval_runtime": 16.4845, "eval_samples_per_second": 121.326, "eval_steps_per_second": 1.941, "step": 3800 }, { "epoch": 0.73, "learning_rate": 0.00022826380407533586, "loss": 0.7969, "step": 3820 }, { "epoch": 0.74, "learning_rate": 0.00022787812560262258, "loss": 0.8029, "step": 3840 }, { "epoch": 0.74, "learning_rate": 0.00022749244712990936, "loss": 0.7921, "step": 3860 }, { "epoch": 0.74, "learning_rate": 0.0002271067686571961, "loss": 0.8051, "step": 3880 }, { "epoch": 0.75, "learning_rate": 0.00022672109018448283, "loss": 0.807, "step": 3900 }, { "epoch": 0.75, "learning_rate": 0.0002263354117117696, "loss": 0.8042, "step": 3920 }, { "epoch": 0.75, "learning_rate": 0.00022594973323905633, "loss": 0.7947, "step": 3940 }, { "epoch": 0.76, "learning_rate": 0.0002255640547663431, "loss": 0.7972, "step": 3960 }, { "epoch": 0.76, "learning_rate": 0.00022517837629362986, "loss": 0.8038, "step": 3980 }, { "epoch": 0.77, "learning_rate": 0.0002247926978209166, "loss": 0.8064, "step": 4000 }, { "epoch": 0.77, "eval_loss": 0.828279435634613, "eval_runtime": 16.4904, "eval_samples_per_second": 121.283, "eval_steps_per_second": 1.941, "step": 4000 }, { "epoch": 0.77, "learning_rate": 0.00022440701934820336, "loss": 0.8032, "step": 4020 }, { "epoch": 0.77, "learning_rate": 0.00022402134087549014, "loss": 0.7934, "step": 4040 }, { "epoch": 0.78, "learning_rate": 0.00022363566240277686, "loss": 0.7919, "step": 4060 }, { "epoch": 0.78, "learning_rate": 0.00022324998393006364, "loss": 0.8011, "step": 4080 }, { "epoch": 0.79, "learning_rate": 0.00022286430545735036, "loss": 0.8026, "step": 4100 }, { "epoch": 0.79, "learning_rate": 0.0002224786269846371, "loss": 0.804, "step": 4120 }, { "epoch": 0.79, "learning_rate": 0.00022209294851192388, "loss": 0.8122, "step": 4140 }, { "epoch": 0.8, "learning_rate": 0.0002217072700392106, "loss": 0.7932, "step": 4160 }, { "epoch": 0.8, "learning_rate": 0.00022132159156649738, "loss": 0.7911, "step": 4180 }, { "epoch": 0.8, "learning_rate": 0.00022093591309378413, "loss": 0.8012, "step": 4200 }, { "epoch": 0.8, "eval_loss": 0.8261794447898865, "eval_runtime": 16.4921, "eval_samples_per_second": 121.27, "eval_steps_per_second": 1.94, "step": 4200 }, { "epoch": 0.81, "learning_rate": 0.00022055023462107088, "loss": 0.7989, "step": 4220 }, { "epoch": 0.81, "learning_rate": 0.00022016455614835763, "loss": 0.8031, "step": 4240 }, { "epoch": 0.82, "learning_rate": 0.00021977887767564438, "loss": 0.8066, "step": 4260 }, { "epoch": 0.82, "learning_rate": 0.00021939319920293113, "loss": 0.7964, "step": 4280 }, { "epoch": 0.82, "learning_rate": 0.0002190075207302179, "loss": 0.7947, "step": 4300 }, { "epoch": 0.83, "learning_rate": 0.00021862184225750463, "loss": 0.8035, "step": 4320 }, { "epoch": 0.83, "learning_rate": 0.00021823616378479138, "loss": 0.8029, "step": 4340 }, { "epoch": 0.84, "learning_rate": 0.00021785048531207816, "loss": 0.7941, "step": 4360 }, { "epoch": 0.84, "learning_rate": 0.00021746480683936488, "loss": 0.7934, "step": 4380 }, { "epoch": 0.84, "learning_rate": 0.00021707912836665166, "loss": 0.7946, "step": 4400 }, { "epoch": 0.84, "eval_loss": 0.823946475982666, "eval_runtime": 16.4887, "eval_samples_per_second": 121.295, "eval_steps_per_second": 1.941, "step": 4400 }, { "epoch": 0.85, "learning_rate": 0.00021669344989393838, "loss": 0.7974, "step": 4420 }, { "epoch": 0.85, "learning_rate": 0.00021630777142122516, "loss": 0.7962, "step": 4440 }, { "epoch": 0.85, "learning_rate": 0.0002159220929485119, "loss": 0.7946, "step": 4460 }, { "epoch": 0.86, "learning_rate": 0.00021553641447579866, "loss": 0.7818, "step": 4480 }, { "epoch": 0.86, "learning_rate": 0.0002151507360030854, "loss": 0.803, "step": 4500 }, { "epoch": 0.87, "learning_rate": 0.00021476505753037218, "loss": 0.7851, "step": 4520 }, { "epoch": 0.87, "learning_rate": 0.0002143793790576589, "loss": 0.7984, "step": 4540 }, { "epoch": 0.87, "learning_rate": 0.00021399370058494568, "loss": 0.7973, "step": 4560 }, { "epoch": 0.88, "learning_rate": 0.0002136080221122324, "loss": 0.782, "step": 4580 }, { "epoch": 0.88, "learning_rate": 0.00021322234363951916, "loss": 0.7951, "step": 4600 }, { "epoch": 0.88, "eval_loss": 0.8220962285995483, "eval_runtime": 16.5191, "eval_samples_per_second": 121.072, "eval_steps_per_second": 1.937, "step": 4600 }, { "epoch": 0.89, "learning_rate": 0.00021283666516680593, "loss": 0.7947, "step": 4620 }, { "epoch": 0.89, "learning_rate": 0.00021245098669409266, "loss": 0.7957, "step": 4640 }, { "epoch": 0.89, "learning_rate": 0.00021206530822137943, "loss": 0.797, "step": 4660 }, { "epoch": 0.9, "learning_rate": 0.00021167962974866618, "loss": 0.8097, "step": 4680 }, { "epoch": 0.9, "learning_rate": 0.00021129395127595293, "loss": 0.7894, "step": 4700 }, { "epoch": 0.9, "learning_rate": 0.00021090827280323968, "loss": 0.7789, "step": 4720 }, { "epoch": 0.91, "learning_rate": 0.0002105225943305264, "loss": 0.7949, "step": 4740 }, { "epoch": 0.91, "learning_rate": 0.00021013691585781318, "loss": 0.7895, "step": 4760 }, { "epoch": 0.92, "learning_rate": 0.00020975123738509996, "loss": 0.8036, "step": 4780 }, { "epoch": 0.92, "learning_rate": 0.00020936555891238668, "loss": 0.7966, "step": 4800 }, { "epoch": 0.92, "eval_loss": 0.8209095597267151, "eval_runtime": 16.5035, "eval_samples_per_second": 121.187, "eval_steps_per_second": 1.939, "step": 4800 }, { "epoch": 0.92, "learning_rate": 0.00020897988043967343, "loss": 0.7892, "step": 4820 }, { "epoch": 0.93, "learning_rate": 0.0002085942019669602, "loss": 0.7825, "step": 4840 }, { "epoch": 0.93, "learning_rate": 0.00020820852349424693, "loss": 0.7937, "step": 4860 }, { "epoch": 0.94, "learning_rate": 0.0002078228450215337, "loss": 0.7893, "step": 4880 }, { "epoch": 0.94, "learning_rate": 0.00020743716654882043, "loss": 0.7944, "step": 4900 }, { "epoch": 0.94, "learning_rate": 0.0002070514880761072, "loss": 0.7973, "step": 4920 }, { "epoch": 0.95, "learning_rate": 0.00020666580960339396, "loss": 0.7919, "step": 4940 }, { "epoch": 0.95, "learning_rate": 0.0002062801311306807, "loss": 0.7918, "step": 4960 }, { "epoch": 0.95, "learning_rate": 0.00020589445265796746, "loss": 0.7901, "step": 4980 }, { "epoch": 0.96, "learning_rate": 0.00020550877418525423, "loss": 0.7891, "step": 5000 }, { "epoch": 0.96, "eval_loss": 0.8192855715751648, "eval_runtime": 16.5248, "eval_samples_per_second": 121.03, "eval_steps_per_second": 1.936, "step": 5000 }, { "epoch": 0.96, "learning_rate": 0.00020512309571254096, "loss": 0.7813, "step": 5020 }, { "epoch": 0.97, "learning_rate": 0.0002047374172398277, "loss": 0.7831, "step": 5040 }, { "epoch": 0.97, "learning_rate": 0.00020435173876711445, "loss": 0.7911, "step": 5060 }, { "epoch": 0.97, "learning_rate": 0.0002039660602944012, "loss": 0.7816, "step": 5080 }, { "epoch": 0.98, "learning_rate": 0.00020358038182168798, "loss": 0.7915, "step": 5100 }, { "epoch": 0.98, "learning_rate": 0.0002031947033489747, "loss": 0.791, "step": 5120 }, { "epoch": 0.98, "learning_rate": 0.00020280902487626148, "loss": 0.7851, "step": 5140 }, { "epoch": 0.99, "learning_rate": 0.00020242334640354823, "loss": 0.7859, "step": 5160 }, { "epoch": 0.99, "learning_rate": 0.00020203766793083498, "loss": 0.7888, "step": 5180 }, { "epoch": 1.0, "learning_rate": 0.00020165198945812173, "loss": 0.7854, "step": 5200 }, { "epoch": 1.0, "eval_loss": 0.8173321485519409, "eval_runtime": 16.5042, "eval_samples_per_second": 121.182, "eval_steps_per_second": 1.939, "step": 5200 }, { "epoch": 1.0, "learning_rate": 0.0002012663109854085, "loss": 0.7888, "step": 5220 }, { "epoch": 1.0, "learning_rate": 0.00020088063251269523, "loss": 0.7893, "step": 5240 }, { "epoch": 1.01, "learning_rate": 0.00020049495403998198, "loss": 0.7817, "step": 5260 }, { "epoch": 1.01, "learning_rate": 0.00020010927556726873, "loss": 0.7755, "step": 5280 }, { "epoch": 1.02, "learning_rate": 0.00019972359709455548, "loss": 0.7839, "step": 5300 }, { "epoch": 1.02, "learning_rate": 0.00019933791862184226, "loss": 0.7911, "step": 5320 }, { "epoch": 1.02, "learning_rate": 0.00019895224014912898, "loss": 0.7819, "step": 5340 }, { "epoch": 1.03, "learning_rate": 0.00019856656167641576, "loss": 0.7802, "step": 5360 }, { "epoch": 1.03, "learning_rate": 0.0001981808832037025, "loss": 0.7847, "step": 5380 }, { "epoch": 1.03, "learning_rate": 0.00019779520473098925, "loss": 0.7824, "step": 5400 }, { "epoch": 1.03, "eval_loss": 0.8163856267929077, "eval_runtime": 16.5306, "eval_samples_per_second": 120.988, "eval_steps_per_second": 1.936, "step": 5400 }, { "epoch": 1.04, "learning_rate": 0.00019742881018191167, "loss": 0.7757, "step": 5420 }, { "epoch": 1.04, "learning_rate": 0.0001970431317091984, "loss": 0.786, "step": 5440 }, { "epoch": 1.05, "learning_rate": 0.00019665745323648517, "loss": 0.7923, "step": 5460 }, { "epoch": 1.05, "learning_rate": 0.00019627177476377191, "loss": 0.791, "step": 5480 }, { "epoch": 1.05, "learning_rate": 0.00019588609629105866, "loss": 0.7863, "step": 5500 }, { "epoch": 1.06, "learning_rate": 0.00019550041781834541, "loss": 0.7879, "step": 5520 }, { "epoch": 1.06, "learning_rate": 0.0001951147393456322, "loss": 0.7924, "step": 5540 }, { "epoch": 1.07, "learning_rate": 0.00019472906087291891, "loss": 0.7918, "step": 5560 }, { "epoch": 1.07, "learning_rate": 0.0001943433824002057, "loss": 0.792, "step": 5580 }, { "epoch": 1.07, "learning_rate": 0.0001939577039274924, "loss": 0.7784, "step": 5600 }, { "epoch": 1.07, "eval_loss": 0.8148436546325684, "eval_runtime": 16.5424, "eval_samples_per_second": 120.901, "eval_steps_per_second": 1.934, "step": 5600 }, { "epoch": 1.08, "learning_rate": 0.0001935720254547792, "loss": 0.7903, "step": 5620 }, { "epoch": 1.08, "learning_rate": 0.00019318634698206594, "loss": 0.785, "step": 5640 }, { "epoch": 1.08, "learning_rate": 0.00019280066850935266, "loss": 0.7916, "step": 5660 }, { "epoch": 1.09, "learning_rate": 0.00019241499003663944, "loss": 0.779, "step": 5680 }, { "epoch": 1.09, "learning_rate": 0.0001920293115639262, "loss": 0.7909, "step": 5700 }, { "epoch": 1.1, "learning_rate": 0.00019164363309121294, "loss": 0.7798, "step": 5720 }, { "epoch": 1.1, "learning_rate": 0.0001912579546184997, "loss": 0.7846, "step": 5740 }, { "epoch": 1.1, "learning_rate": 0.00019087227614578647, "loss": 0.7887, "step": 5760 }, { "epoch": 1.11, "learning_rate": 0.0001904865976730732, "loss": 0.7802, "step": 5780 }, { "epoch": 1.11, "learning_rate": 0.00019010091920035997, "loss": 0.7891, "step": 5800 }, { "epoch": 1.11, "eval_loss": 0.8130878806114197, "eval_runtime": 16.5056, "eval_samples_per_second": 121.171, "eval_steps_per_second": 1.939, "step": 5800 }, { "epoch": 1.12, "learning_rate": 0.0001897152407276467, "loss": 0.7819, "step": 5820 }, { "epoch": 1.12, "learning_rate": 0.00018932956225493346, "loss": 0.7945, "step": 5840 }, { "epoch": 1.12, "learning_rate": 0.00018894388378222021, "loss": 0.784, "step": 5860 }, { "epoch": 1.13, "learning_rate": 0.00018855820530950694, "loss": 0.7838, "step": 5880 }, { "epoch": 1.13, "learning_rate": 0.00018817252683679371, "loss": 0.7841, "step": 5900 }, { "epoch": 1.13, "learning_rate": 0.0001877868483640805, "loss": 0.7909, "step": 5920 }, { "epoch": 1.14, "learning_rate": 0.0001874011698913672, "loss": 0.7775, "step": 5940 }, { "epoch": 1.14, "learning_rate": 0.00018701549141865396, "loss": 0.7827, "step": 5960 }, { "epoch": 1.15, "learning_rate": 0.0001866298129459407, "loss": 0.7866, "step": 5980 }, { "epoch": 1.15, "learning_rate": 0.00018624413447322746, "loss": 0.7696, "step": 6000 }, { "epoch": 1.15, "eval_loss": 0.8125277757644653, "eval_runtime": 16.506, "eval_samples_per_second": 121.168, "eval_steps_per_second": 1.939, "step": 6000 }, { "epoch": 1.15, "learning_rate": 0.00018585845600051424, "loss": 0.783, "step": 6020 }, { "epoch": 1.16, "learning_rate": 0.00018547277752780096, "loss": 0.7792, "step": 6040 }, { "epoch": 1.16, "learning_rate": 0.00018508709905508774, "loss": 0.7775, "step": 6060 }, { "epoch": 1.16, "learning_rate": 0.0001847014205823745, "loss": 0.7806, "step": 6080 }, { "epoch": 1.17, "learning_rate": 0.0001843157421096612, "loss": 0.7801, "step": 6100 }, { "epoch": 1.17, "learning_rate": 0.000183930063636948, "loss": 0.7853, "step": 6120 }, { "epoch": 1.18, "learning_rate": 0.0001835443851642347, "loss": 0.7937, "step": 6140 }, { "epoch": 1.18, "learning_rate": 0.0001831587066915215, "loss": 0.7873, "step": 6160 }, { "epoch": 1.18, "learning_rate": 0.00018277302821880824, "loss": 0.778, "step": 6180 }, { "epoch": 1.19, "learning_rate": 0.000182387349746095, "loss": 0.781, "step": 6200 }, { "epoch": 1.19, "eval_loss": 0.8113830089569092, "eval_runtime": 16.5217, "eval_samples_per_second": 121.053, "eval_steps_per_second": 1.937, "step": 6200 }, { "epoch": 1.19, "learning_rate": 0.00018200167127338174, "loss": 0.7746, "step": 6220 }, { "epoch": 1.2, "learning_rate": 0.00018161599280066851, "loss": 0.7752, "step": 6240 }, { "epoch": 1.2, "learning_rate": 0.00018123031432795524, "loss": 0.7838, "step": 6260 }, { "epoch": 1.2, "learning_rate": 0.000180844635855242, "loss": 0.789, "step": 6280 }, { "epoch": 1.21, "learning_rate": 0.00018045895738252874, "loss": 0.7882, "step": 6300 }, { "epoch": 1.21, "learning_rate": 0.00018007327890981549, "loss": 0.7822, "step": 6320 }, { "epoch": 1.21, "learning_rate": 0.00017968760043710226, "loss": 0.7889, "step": 6340 }, { "epoch": 1.22, "learning_rate": 0.00017930192196438899, "loss": 0.7891, "step": 6360 }, { "epoch": 1.22, "learning_rate": 0.00017891624349167576, "loss": 0.7884, "step": 6380 }, { "epoch": 1.23, "learning_rate": 0.0001785305650189625, "loss": 0.7733, "step": 6400 }, { "epoch": 1.23, "eval_loss": 0.810148298740387, "eval_runtime": 16.5113, "eval_samples_per_second": 121.129, "eval_steps_per_second": 1.938, "step": 6400 }, { "epoch": 1.23, "learning_rate": 0.00017814488654624926, "loss": 0.7794, "step": 6420 }, { "epoch": 1.23, "learning_rate": 0.000177759208073536, "loss": 0.775, "step": 6440 }, { "epoch": 1.24, "learning_rate": 0.00017737352960082276, "loss": 0.7706, "step": 6460 }, { "epoch": 1.24, "learning_rate": 0.0001769878511281095, "loss": 0.7808, "step": 6480 }, { "epoch": 1.25, "learning_rate": 0.0001766021726553963, "loss": 0.7805, "step": 6500 }, { "epoch": 1.25, "learning_rate": 0.000176216494182683, "loss": 0.7813, "step": 6520 }, { "epoch": 1.25, "learning_rate": 0.0001758308157099698, "loss": 0.7789, "step": 6540 }, { "epoch": 1.26, "learning_rate": 0.00017544513723725654, "loss": 0.7827, "step": 6560 }, { "epoch": 1.26, "learning_rate": 0.00017505945876454326, "loss": 0.7763, "step": 6580 }, { "epoch": 1.26, "learning_rate": 0.00017467378029183004, "loss": 0.7779, "step": 6600 }, { "epoch": 1.26, "eval_loss": 0.8090565800666809, "eval_runtime": 16.4954, "eval_samples_per_second": 121.246, "eval_steps_per_second": 1.94, "step": 6600 }, { "epoch": 1.27, "learning_rate": 0.00017428810181911676, "loss": 0.7793, "step": 6620 }, { "epoch": 1.27, "learning_rate": 0.00017390242334640354, "loss": 0.7778, "step": 6640 }, { "epoch": 1.28, "learning_rate": 0.00017351674487369029, "loss": 0.7802, "step": 6660 }, { "epoch": 1.28, "learning_rate": 0.00017313106640097704, "loss": 0.7823, "step": 6680 }, { "epoch": 1.28, "learning_rate": 0.00017274538792826379, "loss": 0.7868, "step": 6700 }, { "epoch": 1.29, "learning_rate": 0.00017235970945555056, "loss": 0.7824, "step": 6720 }, { "epoch": 1.29, "learning_rate": 0.00017197403098283728, "loss": 0.7777, "step": 6740 }, { "epoch": 1.3, "learning_rate": 0.00017158835251012406, "loss": 0.7822, "step": 6760 }, { "epoch": 1.3, "learning_rate": 0.00017120267403741078, "loss": 0.7798, "step": 6780 }, { "epoch": 1.3, "learning_rate": 0.00017081699556469753, "loss": 0.7712, "step": 6800 }, { "epoch": 1.3, "eval_loss": 0.8080956935882568, "eval_runtime": 16.5234, "eval_samples_per_second": 121.041, "eval_steps_per_second": 1.937, "step": 6800 }, { "epoch": 1.31, "learning_rate": 0.0001704313170919843, "loss": 0.7888, "step": 6820 }, { "epoch": 1.31, "learning_rate": 0.00017004563861927103, "loss": 0.7769, "step": 6840 }, { "epoch": 1.31, "learning_rate": 0.0001696599601465578, "loss": 0.7686, "step": 6860 }, { "epoch": 1.32, "learning_rate": 0.00016927428167384456, "loss": 0.7762, "step": 6880 }, { "epoch": 1.32, "learning_rate": 0.0001688886032011313, "loss": 0.7807, "step": 6900 }, { "epoch": 1.33, "learning_rate": 0.00016850292472841806, "loss": 0.7831, "step": 6920 }, { "epoch": 1.33, "learning_rate": 0.0001681172462557048, "loss": 0.7856, "step": 6940 }, { "epoch": 1.33, "learning_rate": 0.00016773156778299156, "loss": 0.775, "step": 6960 }, { "epoch": 1.34, "learning_rate": 0.00016734588931027834, "loss": 0.7835, "step": 6980 }, { "epoch": 1.34, "learning_rate": 0.00016696021083756506, "loss": 0.7756, "step": 7000 }, { "epoch": 1.34, "eval_loss": 0.8070209622383118, "eval_runtime": 16.4997, "eval_samples_per_second": 121.214, "eval_steps_per_second": 1.939, "step": 7000 }, { "epoch": 1.35, "learning_rate": 0.0001665745323648518, "loss": 0.7756, "step": 7020 }, { "epoch": 1.35, "learning_rate": 0.00016618885389213859, "loss": 0.7783, "step": 7040 }, { "epoch": 1.35, "learning_rate": 0.0001658031754194253, "loss": 0.7697, "step": 7060 }, { "epoch": 1.36, "learning_rate": 0.00016541749694671208, "loss": 0.7889, "step": 7080 }, { "epoch": 1.36, "learning_rate": 0.00016503181847399883, "loss": 0.7725, "step": 7100 }, { "epoch": 1.36, "learning_rate": 0.00016464614000128558, "loss": 0.7726, "step": 7120 }, { "epoch": 1.37, "learning_rate": 0.00016426046152857233, "loss": 0.7787, "step": 7140 }, { "epoch": 1.37, "learning_rate": 0.00016387478305585908, "loss": 0.782, "step": 7160 }, { "epoch": 1.38, "learning_rate": 0.00016348910458314583, "loss": 0.7736, "step": 7180 }, { "epoch": 1.38, "learning_rate": 0.0001631034261104326, "loss": 0.7748, "step": 7200 }, { "epoch": 1.38, "eval_loss": 0.8063712120056152, "eval_runtime": 16.5096, "eval_samples_per_second": 121.142, "eval_steps_per_second": 1.938, "step": 7200 }, { "epoch": 1.38, "learning_rate": 0.00016271774763771933, "loss": 0.7717, "step": 7220 }, { "epoch": 1.39, "learning_rate": 0.00016233206916500608, "loss": 0.7676, "step": 7240 }, { "epoch": 1.39, "learning_rate": 0.00016194639069229286, "loss": 0.7662, "step": 7260 }, { "epoch": 1.39, "learning_rate": 0.00016156071221957958, "loss": 0.7809, "step": 7280 }, { "epoch": 1.4, "learning_rate": 0.00016117503374686636, "loss": 0.7731, "step": 7300 }, { "epoch": 1.4, "learning_rate": 0.00016078935527415308, "loss": 0.7795, "step": 7320 }, { "epoch": 1.41, "learning_rate": 0.00016040367680143986, "loss": 0.78, "step": 7340 }, { "epoch": 1.41, "learning_rate": 0.0001600179983287266, "loss": 0.7785, "step": 7360 }, { "epoch": 1.41, "learning_rate": 0.00015963231985601336, "loss": 0.7694, "step": 7380 }, { "epoch": 1.42, "learning_rate": 0.0001592466413833001, "loss": 0.781, "step": 7400 }, { "epoch": 1.42, "eval_loss": 0.8048364520072937, "eval_runtime": 16.5235, "eval_samples_per_second": 121.04, "eval_steps_per_second": 1.937, "step": 7400 }, { "epoch": 1.42, "learning_rate": 0.00015886096291058688, "loss": 0.7681, "step": 7420 }, { "epoch": 1.43, "learning_rate": 0.0001584752844378736, "loss": 0.7835, "step": 7440 }, { "epoch": 1.43, "learning_rate": 0.00015808960596516038, "loss": 0.7778, "step": 7460 }, { "epoch": 1.43, "learning_rate": 0.0001577039274924471, "loss": 0.775, "step": 7480 }, { "epoch": 1.44, "learning_rate": 0.00015731824901973386, "loss": 0.7758, "step": 7500 }, { "epoch": 1.44, "learning_rate": 0.00015693257054702063, "loss": 0.7846, "step": 7520 }, { "epoch": 1.44, "learning_rate": 0.00015654689207430736, "loss": 0.7756, "step": 7540 }, { "epoch": 1.45, "learning_rate": 0.00015616121360159413, "loss": 0.7764, "step": 7560 }, { "epoch": 1.45, "learning_rate": 0.00015577553512888088, "loss": 0.7684, "step": 7580 }, { "epoch": 1.46, "learning_rate": 0.00015538985665616763, "loss": 0.7837, "step": 7600 }, { "epoch": 1.46, "eval_loss": 0.8041849136352539, "eval_runtime": 16.4633, "eval_samples_per_second": 121.482, "eval_steps_per_second": 1.944, "step": 7600 }, { "epoch": 1.46, "learning_rate": 0.00015500417818345438, "loss": 0.772, "step": 7620 }, { "epoch": 1.46, "learning_rate": 0.0001546184997107411, "loss": 0.7759, "step": 7640 }, { "epoch": 1.47, "learning_rate": 0.00015423282123802788, "loss": 0.7778, "step": 7660 }, { "epoch": 1.47, "learning_rate": 0.00015384714276531466, "loss": 0.78, "step": 7680 }, { "epoch": 1.48, "learning_rate": 0.00015346146429260138, "loss": 0.7681, "step": 7700 }, { "epoch": 1.48, "learning_rate": 0.00015307578581988813, "loss": 0.7731, "step": 7720 }, { "epoch": 1.48, "learning_rate": 0.0001526901073471749, "loss": 0.78, "step": 7740 }, { "epoch": 1.49, "learning_rate": 0.00015230442887446163, "loss": 0.7719, "step": 7760 }, { "epoch": 1.49, "learning_rate": 0.0001519187504017484, "loss": 0.7667, "step": 7780 }, { "epoch": 1.49, "learning_rate": 0.00015153307192903513, "loss": 0.7804, "step": 7800 }, { "epoch": 1.49, "eval_loss": 0.8034607768058777, "eval_runtime": 16.4833, "eval_samples_per_second": 121.335, "eval_steps_per_second": 1.941, "step": 7800 }, { "epoch": 1.5, "learning_rate": 0.0001511473934563219, "loss": 0.7813, "step": 7820 }, { "epoch": 1.5, "learning_rate": 0.00015076171498360866, "loss": 0.7751, "step": 7840 }, { "epoch": 1.51, "learning_rate": 0.00015037603651089538, "loss": 0.7681, "step": 7860 }, { "epoch": 1.51, "learning_rate": 0.00014999035803818216, "loss": 0.7679, "step": 7880 }, { "epoch": 1.51, "learning_rate": 0.0001496046795654689, "loss": 0.7723, "step": 7900 }, { "epoch": 1.52, "learning_rate": 0.00014921900109275566, "loss": 0.7732, "step": 7920 }, { "epoch": 1.52, "learning_rate": 0.0001488333226200424, "loss": 0.7805, "step": 7940 }, { "epoch": 1.53, "learning_rate": 0.00014844764414732916, "loss": 0.7666, "step": 7960 }, { "epoch": 1.53, "learning_rate": 0.0001480619656746159, "loss": 0.7801, "step": 7980 }, { "epoch": 1.53, "learning_rate": 0.00014767628720190265, "loss": 0.7736, "step": 8000 }, { "epoch": 1.53, "eval_loss": 0.8029702305793762, "eval_runtime": 16.5088, "eval_samples_per_second": 121.147, "eval_steps_per_second": 1.938, "step": 8000 }, { "epoch": 1.54, "learning_rate": 0.00014729060872918943, "loss": 0.7716, "step": 8020 }, { "epoch": 1.54, "learning_rate": 0.00014690493025647618, "loss": 0.7771, "step": 8040 }, { "epoch": 1.54, "learning_rate": 0.00014651925178376293, "loss": 0.7715, "step": 8060 }, { "epoch": 1.55, "learning_rate": 0.00014613357331104968, "loss": 0.7731, "step": 8080 }, { "epoch": 1.55, "learning_rate": 0.00014574789483833643, "loss": 0.7763, "step": 8100 }, { "epoch": 1.56, "learning_rate": 0.00014536221636562318, "loss": 0.7705, "step": 8120 }, { "epoch": 1.56, "learning_rate": 0.00014497653789290993, "loss": 0.7702, "step": 8140 }, { "epoch": 1.56, "learning_rate": 0.00014459085942019668, "loss": 0.7752, "step": 8160 }, { "epoch": 1.57, "learning_rate": 0.00014420518094748343, "loss": 0.7662, "step": 8180 }, { "epoch": 1.57, "learning_rate": 0.00014381950247477018, "loss": 0.7757, "step": 8200 }, { "epoch": 1.57, "eval_loss": 0.8025923371315002, "eval_runtime": 16.5398, "eval_samples_per_second": 120.921, "eval_steps_per_second": 1.935, "step": 8200 }, { "epoch": 1.58, "learning_rate": 0.00014343382400205693, "loss": 0.7638, "step": 8220 }, { "epoch": 1.58, "learning_rate": 0.0001430481455293437, "loss": 0.7836, "step": 8240 }, { "epoch": 1.58, "learning_rate": 0.00014266246705663046, "loss": 0.7685, "step": 8260 }, { "epoch": 1.59, "learning_rate": 0.0001422767885839172, "loss": 0.7901, "step": 8280 }, { "epoch": 1.59, "learning_rate": 0.00014189111011120396, "loss": 0.7729, "step": 8300 }, { "epoch": 1.59, "learning_rate": 0.0001415054316384907, "loss": 0.7614, "step": 8320 }, { "epoch": 1.6, "learning_rate": 0.00014111975316577745, "loss": 0.7789, "step": 8340 }, { "epoch": 1.6, "learning_rate": 0.0001407340746930642, "loss": 0.7713, "step": 8360 }, { "epoch": 1.61, "learning_rate": 0.00014034839622035095, "loss": 0.7831, "step": 8380 }, { "epoch": 1.61, "learning_rate": 0.0001399627177476377, "loss": 0.7674, "step": 8400 }, { "epoch": 1.61, "eval_loss": 0.801445722579956, "eval_runtime": 16.5305, "eval_samples_per_second": 120.989, "eval_steps_per_second": 1.936, "step": 8400 }, { "epoch": 1.61, "learning_rate": 0.00013957703927492445, "loss": 0.7698, "step": 8420 }, { "epoch": 1.62, "learning_rate": 0.0001391913608022112, "loss": 0.7725, "step": 8440 }, { "epoch": 1.62, "learning_rate": 0.00013880568232949795, "loss": 0.771, "step": 8460 }, { "epoch": 1.62, "learning_rate": 0.00013842000385678473, "loss": 0.7679, "step": 8480 }, { "epoch": 1.63, "learning_rate": 0.00013803432538407148, "loss": 0.7788, "step": 8500 }, { "epoch": 1.63, "learning_rate": 0.00013764864691135823, "loss": 0.7705, "step": 8520 }, { "epoch": 1.64, "learning_rate": 0.00013726296843864495, "loss": 0.7625, "step": 8540 }, { "epoch": 1.64, "learning_rate": 0.00013687728996593173, "loss": 0.7626, "step": 8560 }, { "epoch": 1.64, "learning_rate": 0.00013649161149321848, "loss": 0.7731, "step": 8580 }, { "epoch": 1.65, "learning_rate": 0.00013610593302050523, "loss": 0.7788, "step": 8600 }, { "epoch": 1.65, "eval_loss": 0.8010225296020508, "eval_runtime": 16.5075, "eval_samples_per_second": 121.157, "eval_steps_per_second": 1.939, "step": 8600 }, { "epoch": 1.65, "learning_rate": 0.00013572025454779198, "loss": 0.7758, "step": 8620 }, { "epoch": 1.66, "learning_rate": 0.00013533457607507873, "loss": 0.7738, "step": 8640 }, { "epoch": 1.66, "learning_rate": 0.00013494889760236548, "loss": 0.7827, "step": 8660 }, { "epoch": 1.66, "learning_rate": 0.00013456321912965223, "loss": 0.779, "step": 8680 }, { "epoch": 1.67, "learning_rate": 0.00013417754065693898, "loss": 0.771, "step": 8700 }, { "epoch": 1.67, "learning_rate": 0.00013379186218422575, "loss": 0.7683, "step": 8720 }, { "epoch": 1.67, "learning_rate": 0.0001334061837115125, "loss": 0.7728, "step": 8740 }, { "epoch": 1.68, "learning_rate": 0.00013302050523879925, "loss": 0.7761, "step": 8760 }, { "epoch": 1.68, "learning_rate": 0.00013263482676608598, "loss": 0.7705, "step": 8780 }, { "epoch": 1.69, "learning_rate": 0.00013224914829337275, "loss": 0.7624, "step": 8800 }, { "epoch": 1.69, "eval_loss": 0.8003928065299988, "eval_runtime": 16.5035, "eval_samples_per_second": 121.186, "eval_steps_per_second": 1.939, "step": 8800 }, { "epoch": 1.69, "learning_rate": 0.0001318634698206595, "loss": 0.7669, "step": 8820 }, { "epoch": 1.69, "learning_rate": 0.00013147779134794625, "loss": 0.7675, "step": 8840 }, { "epoch": 1.7, "learning_rate": 0.000131092112875233, "loss": 0.7629, "step": 8860 }, { "epoch": 1.7, "learning_rate": 0.00013070643440251975, "loss": 0.7663, "step": 8880 }, { "epoch": 1.71, "learning_rate": 0.0001303207559298065, "loss": 0.7708, "step": 8900 }, { "epoch": 1.71, "learning_rate": 0.00012993507745709325, "loss": 0.7734, "step": 8920 }, { "epoch": 1.71, "learning_rate": 0.00012954939898438, "loss": 0.7711, "step": 8940 }, { "epoch": 1.72, "learning_rate": 0.00012916372051166678, "loss": 0.769, "step": 8960 }, { "epoch": 1.72, "learning_rate": 0.00012877804203895353, "loss": 0.7706, "step": 8980 }, { "epoch": 1.72, "learning_rate": 0.00012839236356624025, "loss": 0.7752, "step": 9000 }, { "epoch": 1.72, "eval_loss": 0.799389660358429, "eval_runtime": 16.516, "eval_samples_per_second": 121.094, "eval_steps_per_second": 1.938, "step": 9000 }, { "epoch": 1.73, "learning_rate": 0.000128006685093527, "loss": 0.7678, "step": 9020 }, { "epoch": 1.73, "learning_rate": 0.00012762100662081378, "loss": 0.7764, "step": 9040 }, { "epoch": 1.74, "learning_rate": 0.00012723532814810053, "loss": 0.7672, "step": 9060 }, { "epoch": 1.74, "learning_rate": 0.00012684964967538728, "loss": 0.7705, "step": 9080 }, { "epoch": 1.74, "learning_rate": 0.00012646397120267403, "loss": 0.7657, "step": 9100 }, { "epoch": 1.75, "learning_rate": 0.00012607829272996078, "loss": 0.7648, "step": 9120 }, { "epoch": 1.75, "learning_rate": 0.00012569261425724753, "loss": 0.7737, "step": 9140 }, { "epoch": 1.76, "learning_rate": 0.00012530693578453428, "loss": 0.7628, "step": 9160 }, { "epoch": 1.76, "learning_rate": 0.00012492125731182103, "loss": 0.767, "step": 9180 }, { "epoch": 1.76, "learning_rate": 0.0001245355788391078, "loss": 0.764, "step": 9200 }, { "epoch": 1.76, "eval_loss": 0.7991757988929749, "eval_runtime": 16.5416, "eval_samples_per_second": 120.907, "eval_steps_per_second": 1.935, "step": 9200 }, { "epoch": 1.77, "learning_rate": 0.00012414990036639455, "loss": 0.7658, "step": 9220 }, { "epoch": 1.77, "learning_rate": 0.00012376422189368128, "loss": 0.7642, "step": 9240 }, { "epoch": 1.77, "learning_rate": 0.00012337854342096802, "loss": 0.7611, "step": 9260 }, { "epoch": 1.78, "learning_rate": 0.0001229928649482548, "loss": 0.7665, "step": 9280 }, { "epoch": 1.78, "learning_rate": 0.00012260718647554155, "loss": 0.7785, "step": 9300 }, { "epoch": 1.79, "learning_rate": 0.0001222215080028283, "loss": 0.7673, "step": 9320 }, { "epoch": 1.79, "learning_rate": 0.00012183582953011504, "loss": 0.777, "step": 9340 }, { "epoch": 1.79, "learning_rate": 0.0001214501510574018, "loss": 0.7684, "step": 9360 }, { "epoch": 1.8, "learning_rate": 0.00012106447258468855, "loss": 0.7694, "step": 9380 }, { "epoch": 1.8, "learning_rate": 0.0001206787941119753, "loss": 0.7634, "step": 9400 }, { "epoch": 1.8, "eval_loss": 0.7980849742889404, "eval_runtime": 16.5086, "eval_samples_per_second": 121.149, "eval_steps_per_second": 1.938, "step": 9400 }, { "epoch": 1.8, "learning_rate": 0.00012031239956289772, "loss": 0.7636, "step": 9420 }, { "epoch": 1.81, "learning_rate": 0.00011992672109018447, "loss": 0.7629, "step": 9440 }, { "epoch": 1.81, "learning_rate": 0.00011954104261747122, "loss": 0.7724, "step": 9460 }, { "epoch": 1.82, "learning_rate": 0.00011915536414475797, "loss": 0.7697, "step": 9480 }, { "epoch": 1.82, "learning_rate": 0.00011876968567204474, "loss": 0.7574, "step": 9500 }, { "epoch": 1.82, "learning_rate": 0.00011838400719933149, "loss": 0.7719, "step": 9520 }, { "epoch": 1.83, "learning_rate": 0.00011799832872661822, "loss": 0.7761, "step": 9540 }, { "epoch": 1.83, "learning_rate": 0.00011761265025390497, "loss": 0.7693, "step": 9560 }, { "epoch": 1.84, "learning_rate": 0.00011722697178119174, "loss": 0.7687, "step": 9580 }, { "epoch": 1.84, "learning_rate": 0.00011684129330847849, "loss": 0.7758, "step": 9600 }, { "epoch": 1.84, "eval_loss": 0.7981218099594116, "eval_runtime": 16.5407, "eval_samples_per_second": 120.914, "eval_steps_per_second": 1.935, "step": 9600 }, { "epoch": 1.84, "learning_rate": 0.00011645561483576524, "loss": 0.7603, "step": 9620 }, { "epoch": 1.85, "learning_rate": 0.00011606993636305199, "loss": 0.7579, "step": 9640 }, { "epoch": 1.85, "learning_rate": 0.00011568425789033875, "loss": 0.7673, "step": 9660 }, { "epoch": 1.85, "learning_rate": 0.0001152985794176255, "loss": 0.7745, "step": 9680 }, { "epoch": 1.86, "learning_rate": 0.00011491290094491225, "loss": 0.758, "step": 9700 }, { "epoch": 1.86, "learning_rate": 0.000114527222472199, "loss": 0.7686, "step": 9720 }, { "epoch": 1.87, "learning_rate": 0.00011414154399948576, "loss": 0.7741, "step": 9740 }, { "epoch": 1.87, "learning_rate": 0.00011375586552677251, "loss": 0.7646, "step": 9760 }, { "epoch": 1.87, "learning_rate": 0.00011337018705405925, "loss": 0.7675, "step": 9780 }, { "epoch": 1.88, "learning_rate": 0.000112984508581346, "loss": 0.7637, "step": 9800 }, { "epoch": 1.88, "eval_loss": 0.7970672845840454, "eval_runtime": 16.5386, "eval_samples_per_second": 120.929, "eval_steps_per_second": 1.935, "step": 9800 }, { "epoch": 1.88, "learning_rate": 0.00011259883010863276, "loss": 0.7678, "step": 9820 }, { "epoch": 1.89, "learning_rate": 0.00011221315163591951, "loss": 0.762, "step": 9840 }, { "epoch": 1.89, "learning_rate": 0.00011182747316320626, "loss": 0.7653, "step": 9860 }, { "epoch": 1.89, "learning_rate": 0.00011144179469049301, "loss": 0.7666, "step": 9880 }, { "epoch": 1.9, "learning_rate": 0.00011105611621777977, "loss": 0.7621, "step": 9900 }, { "epoch": 1.9, "learning_rate": 0.00011067043774506652, "loss": 0.7715, "step": 9920 }, { "epoch": 1.9, "learning_rate": 0.00011028475927235327, "loss": 0.7605, "step": 9940 }, { "epoch": 1.91, "learning_rate": 0.00010989908079964002, "loss": 0.7618, "step": 9960 }, { "epoch": 1.91, "learning_rate": 0.00010951340232692679, "loss": 0.7726, "step": 9980 }, { "epoch": 1.92, "learning_rate": 0.00010912772385421352, "loss": 0.7684, "step": 10000 }, { "epoch": 1.92, "eval_loss": 0.7967627048492432, "eval_runtime": 16.5033, "eval_samples_per_second": 121.188, "eval_steps_per_second": 1.939, "step": 10000 }, { "epoch": 1.92, "learning_rate": 0.00010874204538150027, "loss": 0.7666, "step": 10020 }, { "epoch": 1.92, "learning_rate": 0.00010835636690878702, "loss": 0.7661, "step": 10040 }, { "epoch": 1.93, "learning_rate": 0.00010797068843607378, "loss": 0.7621, "step": 10060 }, { "epoch": 1.93, "learning_rate": 0.00010758500996336053, "loss": 0.7736, "step": 10080 }, { "epoch": 1.94, "learning_rate": 0.00010719933149064728, "loss": 0.76, "step": 10100 }, { "epoch": 1.94, "learning_rate": 0.00010681365301793405, "loss": 0.764, "step": 10120 }, { "epoch": 1.94, "learning_rate": 0.0001064279745452208, "loss": 0.7697, "step": 10140 }, { "epoch": 1.95, "learning_rate": 0.00010604229607250755, "loss": 0.7602, "step": 10160 }, { "epoch": 1.95, "learning_rate": 0.0001056566175997943, "loss": 0.766, "step": 10180 }, { "epoch": 1.95, "learning_rate": 0.00010527093912708106, "loss": 0.7719, "step": 10200 }, { "epoch": 1.95, "eval_loss": 0.7964752912521362, "eval_runtime": 16.4947, "eval_samples_per_second": 121.251, "eval_steps_per_second": 1.94, "step": 10200 }, { "epoch": 1.96, "learning_rate": 0.00010488526065436781, "loss": 0.7653, "step": 10220 }, { "epoch": 1.96, "learning_rate": 0.00010449958218165455, "loss": 0.7653, "step": 10240 }, { "epoch": 1.97, "learning_rate": 0.0001041139037089413, "loss": 0.7711, "step": 10260 }, { "epoch": 1.97, "learning_rate": 0.00010372822523622806, "loss": 0.7729, "step": 10280 }, { "epoch": 1.97, "learning_rate": 0.00010334254676351481, "loss": 0.7709, "step": 10300 }, { "epoch": 1.98, "learning_rate": 0.00010295686829080156, "loss": 0.7611, "step": 10320 }, { "epoch": 1.98, "learning_rate": 0.00010257118981808831, "loss": 0.7607, "step": 10340 }, { "epoch": 1.99, "learning_rate": 0.00010218551134537507, "loss": 0.761, "step": 10360 }, { "epoch": 1.99, "learning_rate": 0.00010179983287266182, "loss": 0.7645, "step": 10380 }, { "epoch": 1.99, "learning_rate": 0.00010141415439994857, "loss": 0.7682, "step": 10400 }, { "epoch": 1.99, "eval_loss": 0.7955361008644104, "eval_runtime": 16.5066, "eval_samples_per_second": 121.164, "eval_steps_per_second": 1.939, "step": 10400 }, { "epoch": 2.0, "learning_rate": 0.00010102847592723531, "loss": 0.76, "step": 10420 }, { "epoch": 2.0, "learning_rate": 0.00010064279745452208, "loss": 0.7653, "step": 10440 }, { "epoch": 2.0, "learning_rate": 0.00010025711898180882, "loss": 0.7625, "step": 10460 }, { "epoch": 2.01, "learning_rate": 9.987144050909557e-05, "loss": 0.764, "step": 10480 }, { "epoch": 2.01, "learning_rate": 9.948576203638232e-05, "loss": 0.766, "step": 10500 }, { "epoch": 2.02, "learning_rate": 9.910008356366908e-05, "loss": 0.7656, "step": 10520 }, { "epoch": 2.02, "learning_rate": 9.871440509095583e-05, "loss": 0.7698, "step": 10540 }, { "epoch": 2.02, "learning_rate": 9.832872661824258e-05, "loss": 0.7635, "step": 10560 }, { "epoch": 2.03, "learning_rate": 9.794304814552933e-05, "loss": 0.77, "step": 10580 }, { "epoch": 2.03, "learning_rate": 9.75573696728161e-05, "loss": 0.7651, "step": 10600 }, { "epoch": 2.03, "eval_loss": 0.7953855395317078, "eval_runtime": 16.5084, "eval_samples_per_second": 121.15, "eval_steps_per_second": 1.938, "step": 10600 }, { "epoch": 2.03, "learning_rate": 9.717169120010285e-05, "loss": 0.7628, "step": 10620 }, { "epoch": 2.04, "learning_rate": 9.67860127273896e-05, "loss": 0.7662, "step": 10640 }, { "epoch": 2.04, "learning_rate": 9.640033425467633e-05, "loss": 0.7635, "step": 10660 }, { "epoch": 2.05, "learning_rate": 9.60146557819631e-05, "loss": 0.7601, "step": 10680 }, { "epoch": 2.05, "learning_rate": 9.562897730924984e-05, "loss": 0.7649, "step": 10700 }, { "epoch": 2.05, "learning_rate": 9.52432988365366e-05, "loss": 0.758, "step": 10720 }, { "epoch": 2.06, "learning_rate": 9.485762036382334e-05, "loss": 0.767, "step": 10740 }, { "epoch": 2.06, "learning_rate": 9.447194189111011e-05, "loss": 0.7559, "step": 10760 }, { "epoch": 2.07, "learning_rate": 9.408626341839686e-05, "loss": 0.765, "step": 10780 }, { "epoch": 2.07, "learning_rate": 9.37005849456836e-05, "loss": 0.7641, "step": 10800 }, { "epoch": 2.07, "eval_loss": 0.794941782951355, "eval_runtime": 16.5101, "eval_samples_per_second": 121.138, "eval_steps_per_second": 1.938, "step": 10800 }, { "epoch": 2.07, "learning_rate": 9.331490647297036e-05, "loss": 0.7691, "step": 10820 }, { "epoch": 2.08, "learning_rate": 9.292922800025712e-05, "loss": 0.7611, "step": 10840 }, { "epoch": 2.08, "learning_rate": 9.254354952754387e-05, "loss": 0.7609, "step": 10860 }, { "epoch": 2.08, "learning_rate": 9.21578710548306e-05, "loss": 0.758, "step": 10880 }, { "epoch": 2.09, "learning_rate": 9.177219258211736e-05, "loss": 0.7637, "step": 10900 }, { "epoch": 2.09, "learning_rate": 9.138651410940412e-05, "loss": 0.7645, "step": 10920 }, { "epoch": 2.1, "learning_rate": 9.100083563669087e-05, "loss": 0.7507, "step": 10940 }, { "epoch": 2.1, "learning_rate": 9.061515716397762e-05, "loss": 0.7673, "step": 10960 }, { "epoch": 2.1, "learning_rate": 9.022947869126437e-05, "loss": 0.7552, "step": 10980 }, { "epoch": 2.11, "learning_rate": 8.984380021855113e-05, "loss": 0.7639, "step": 11000 }, { "epoch": 2.11, "eval_loss": 0.7940524220466614, "eval_runtime": 16.5029, "eval_samples_per_second": 121.19, "eval_steps_per_second": 1.939, "step": 11000 }, { "epoch": 2.11, "learning_rate": 8.945812174583788e-05, "loss": 0.7719, "step": 11020 }, { "epoch": 2.12, "learning_rate": 8.907244327312463e-05, "loss": 0.7641, "step": 11040 }, { "epoch": 2.12, "learning_rate": 8.868676480041138e-05, "loss": 0.7614, "step": 11060 }, { "epoch": 2.12, "learning_rate": 8.830108632769814e-05, "loss": 0.7785, "step": 11080 }, { "epoch": 2.13, "learning_rate": 8.79154078549849e-05, "loss": 0.7756, "step": 11100 }, { "epoch": 2.13, "learning_rate": 8.752972938227163e-05, "loss": 0.7645, "step": 11120 }, { "epoch": 2.13, "learning_rate": 8.714405090955838e-05, "loss": 0.7621, "step": 11140 }, { "epoch": 2.14, "learning_rate": 8.675837243684514e-05, "loss": 0.7662, "step": 11160 }, { "epoch": 2.14, "learning_rate": 8.637269396413189e-05, "loss": 0.7617, "step": 11180 }, { "epoch": 2.15, "learning_rate": 8.598701549141864e-05, "loss": 0.7683, "step": 11200 }, { "epoch": 2.15, "eval_loss": 0.7937352061271667, "eval_runtime": 16.5052, "eval_samples_per_second": 121.174, "eval_steps_per_second": 1.939, "step": 11200 }, { "epoch": 2.15, "learning_rate": 8.560133701870539e-05, "loss": 0.7635, "step": 11220 }, { "epoch": 2.15, "learning_rate": 8.521565854599216e-05, "loss": 0.7622, "step": 11240 }, { "epoch": 2.16, "learning_rate": 8.48299800732789e-05, "loss": 0.7616, "step": 11260 }, { "epoch": 2.16, "learning_rate": 8.444430160056565e-05, "loss": 0.7558, "step": 11280 }, { "epoch": 2.17, "learning_rate": 8.40586231278524e-05, "loss": 0.7714, "step": 11300 }, { "epoch": 2.17, "learning_rate": 8.367294465513917e-05, "loss": 0.7676, "step": 11320 }, { "epoch": 2.17, "learning_rate": 8.32872661824259e-05, "loss": 0.7623, "step": 11340 }, { "epoch": 2.18, "learning_rate": 8.290158770971265e-05, "loss": 0.7608, "step": 11360 }, { "epoch": 2.18, "learning_rate": 8.251590923699942e-05, "loss": 0.7746, "step": 11380 }, { "epoch": 2.18, "learning_rate": 8.213023076428617e-05, "loss": 0.7684, "step": 11400 }, { "epoch": 2.18, "eval_loss": 0.7929428219795227, "eval_runtime": 16.7561, "eval_samples_per_second": 119.359, "eval_steps_per_second": 1.91, "step": 11400 }, { "epoch": 2.19, "learning_rate": 8.174455229157292e-05, "loss": 0.7628, "step": 11420 }, { "epoch": 2.19, "learning_rate": 8.135887381885967e-05, "loss": 0.7614, "step": 11440 }, { "epoch": 2.2, "learning_rate": 8.099247926978209e-05, "loss": 0.7616, "step": 11460 }, { "epoch": 2.2, "learning_rate": 8.060680079706884e-05, "loss": 0.7614, "step": 11480 }, { "epoch": 2.2, "learning_rate": 8.022112232435559e-05, "loss": 0.7684, "step": 11500 }, { "epoch": 2.21, "learning_rate": 7.983544385164233e-05, "loss": 0.7663, "step": 11520 }, { "epoch": 2.21, "learning_rate": 7.94497653789291e-05, "loss": 0.7621, "step": 11540 }, { "epoch": 2.21, "learning_rate": 7.906408690621584e-05, "loss": 0.77, "step": 11560 }, { "epoch": 2.22, "learning_rate": 7.867840843350259e-05, "loss": 0.7629, "step": 11580 }, { "epoch": 2.22, "learning_rate": 7.829272996078934e-05, "loss": 0.7592, "step": 11600 }, { "epoch": 2.22, "eval_loss": 0.7931132316589355, "eval_runtime": 16.4886, "eval_samples_per_second": 121.296, "eval_steps_per_second": 1.941, "step": 11600 }, { "epoch": 2.23, "learning_rate": 7.79070514880761e-05, "loss": 0.7593, "step": 11620 }, { "epoch": 2.23, "learning_rate": 7.752137301536285e-05, "loss": 0.7579, "step": 11640 }, { "epoch": 2.23, "learning_rate": 7.71356945426496e-05, "loss": 0.7666, "step": 11660 }, { "epoch": 2.24, "learning_rate": 7.675001606993635e-05, "loss": 0.7573, "step": 11680 }, { "epoch": 2.24, "learning_rate": 7.636433759722312e-05, "loss": 0.7654, "step": 11700 }, { "epoch": 2.25, "learning_rate": 7.597865912450986e-05, "loss": 0.7637, "step": 11720 }, { "epoch": 2.25, "learning_rate": 7.559298065179661e-05, "loss": 0.7638, "step": 11740 }, { "epoch": 2.25, "learning_rate": 7.520730217908335e-05, "loss": 0.7538, "step": 11760 }, { "epoch": 2.26, "learning_rate": 7.482162370637011e-05, "loss": 0.7598, "step": 11780 }, { "epoch": 2.26, "learning_rate": 7.443594523365686e-05, "loss": 0.7577, "step": 11800 }, { "epoch": 2.26, "eval_loss": 0.7928204536437988, "eval_runtime": 16.5434, "eval_samples_per_second": 120.894, "eval_steps_per_second": 1.934, "step": 11800 }, { "epoch": 2.26, "learning_rate": 7.405026676094361e-05, "loss": 0.7561, "step": 11820 }, { "epoch": 2.27, "learning_rate": 7.366458828823038e-05, "loss": 0.7557, "step": 11840 }, { "epoch": 2.27, "learning_rate": 7.327890981551713e-05, "loss": 0.7606, "step": 11860 }, { "epoch": 2.28, "learning_rate": 7.289323134280388e-05, "loss": 0.7575, "step": 11880 }, { "epoch": 2.28, "learning_rate": 7.250755287009063e-05, "loss": 0.7557, "step": 11900 }, { "epoch": 2.28, "learning_rate": 7.212187439737738e-05, "loss": 0.7687, "step": 11920 }, { "epoch": 2.29, "learning_rate": 7.173619592466414e-05, "loss": 0.7647, "step": 11940 }, { "epoch": 2.29, "learning_rate": 7.135051745195089e-05, "loss": 0.7608, "step": 11960 }, { "epoch": 2.3, "learning_rate": 7.096483897923764e-05, "loss": 0.7624, "step": 11980 }, { "epoch": 2.3, "learning_rate": 7.057916050652439e-05, "loss": 0.7651, "step": 12000 }, { "epoch": 2.3, "eval_loss": 0.7917994856834412, "eval_runtime": 16.5312, "eval_samples_per_second": 120.983, "eval_steps_per_second": 1.936, "step": 12000 }, { "epoch": 2.3, "learning_rate": 7.019348203381114e-05, "loss": 0.7678, "step": 12020 }, { "epoch": 2.31, "learning_rate": 6.980780356109789e-05, "loss": 0.7606, "step": 12040 }, { "epoch": 2.31, "learning_rate": 6.942212508838465e-05, "loss": 0.7607, "step": 12060 }, { "epoch": 2.31, "learning_rate": 6.90364466156714e-05, "loss": 0.763, "step": 12080 }, { "epoch": 2.32, "learning_rate": 6.865076814295815e-05, "loss": 0.7669, "step": 12100 }, { "epoch": 2.32, "learning_rate": 6.82650896702449e-05, "loss": 0.755, "step": 12120 }, { "epoch": 2.33, "learning_rate": 6.787941119753165e-05, "loss": 0.7611, "step": 12140 }, { "epoch": 2.33, "learning_rate": 6.74937327248184e-05, "loss": 0.7576, "step": 12160 }, { "epoch": 2.33, "learning_rate": 6.710805425210516e-05, "loss": 0.7581, "step": 12180 }, { "epoch": 2.34, "learning_rate": 6.672237577939191e-05, "loss": 0.7647, "step": 12200 }, { "epoch": 2.34, "eval_loss": 0.7914180755615234, "eval_runtime": 16.5021, "eval_samples_per_second": 121.196, "eval_steps_per_second": 1.939, "step": 12200 }, { "epoch": 2.34, "learning_rate": 6.633669730667866e-05, "loss": 0.7582, "step": 12220 }, { "epoch": 2.35, "learning_rate": 6.595101883396541e-05, "loss": 0.7531, "step": 12240 }, { "epoch": 2.35, "learning_rate": 6.556534036125216e-05, "loss": 0.7526, "step": 12260 }, { "epoch": 2.35, "learning_rate": 6.517966188853891e-05, "loss": 0.7701, "step": 12280 }, { "epoch": 2.36, "learning_rate": 6.479398341582568e-05, "loss": 0.7662, "step": 12300 }, { "epoch": 2.36, "learning_rate": 6.440830494311241e-05, "loss": 0.7541, "step": 12320 }, { "epoch": 2.36, "learning_rate": 6.402262647039918e-05, "loss": 0.7578, "step": 12340 }, { "epoch": 2.37, "learning_rate": 6.363694799768592e-05, "loss": 0.7569, "step": 12360 }, { "epoch": 2.37, "learning_rate": 6.325126952497267e-05, "loss": 0.7635, "step": 12380 }, { "epoch": 2.38, "learning_rate": 6.286559105225942e-05, "loss": 0.7618, "step": 12400 }, { "epoch": 2.38, "eval_loss": 0.7912635207176208, "eval_runtime": 16.4984, "eval_samples_per_second": 121.224, "eval_steps_per_second": 1.94, "step": 12400 }, { "epoch": 2.38, "learning_rate": 6.247991257954619e-05, "loss": 0.7536, "step": 12420 }, { "epoch": 2.38, "learning_rate": 6.209423410683292e-05, "loss": 0.7478, "step": 12440 }, { "epoch": 2.39, "learning_rate": 6.170855563411969e-05, "loss": 0.745, "step": 12460 }, { "epoch": 2.39, "learning_rate": 6.132287716140644e-05, "loss": 0.7611, "step": 12480 }, { "epoch": 2.4, "learning_rate": 6.0937198688693187e-05, "loss": 0.763, "step": 12500 }, { "epoch": 2.4, "learning_rate": 6.0551520215979936e-05, "loss": 0.7647, "step": 12520 }, { "epoch": 2.4, "learning_rate": 6.016584174326669e-05, "loss": 0.7621, "step": 12540 }, { "epoch": 2.41, "learning_rate": 5.978016327055344e-05, "loss": 0.7568, "step": 12560 }, { "epoch": 2.41, "learning_rate": 5.93944847978402e-05, "loss": 0.7613, "step": 12580 }, { "epoch": 2.41, "learning_rate": 5.900880632512694e-05, "loss": 0.7568, "step": 12600 }, { "epoch": 2.41, "eval_loss": 0.7910023331642151, "eval_runtime": 16.5022, "eval_samples_per_second": 121.196, "eval_steps_per_second": 1.939, "step": 12600 }, { "epoch": 2.42, "learning_rate": 5.86231278524137e-05, "loss": 0.7636, "step": 12620 }, { "epoch": 2.42, "learning_rate": 5.823744937970045e-05, "loss": 0.7657, "step": 12640 }, { "epoch": 2.43, "learning_rate": 5.7851770906987205e-05, "loss": 0.7703, "step": 12660 }, { "epoch": 2.43, "learning_rate": 5.7466092434273955e-05, "loss": 0.7557, "step": 12680 }, { "epoch": 2.43, "learning_rate": 5.708041396156071e-05, "loss": 0.7667, "step": 12700 }, { "epoch": 2.44, "learning_rate": 5.6694735488847454e-05, "loss": 0.7605, "step": 12720 }, { "epoch": 2.44, "learning_rate": 5.630905701613421e-05, "loss": 0.7549, "step": 12740 }, { "epoch": 2.44, "learning_rate": 5.592337854342096e-05, "loss": 0.7592, "step": 12760 }, { "epoch": 2.45, "learning_rate": 5.553770007070772e-05, "loss": 0.7654, "step": 12780 }, { "epoch": 2.45, "learning_rate": 5.5171305521630135e-05, "loss": 0.7636, "step": 12800 }, { "epoch": 2.45, "eval_loss": 0.7906058430671692, "eval_runtime": 16.5012, "eval_samples_per_second": 121.203, "eval_steps_per_second": 1.939, "step": 12800 }, { "epoch": 2.46, "learning_rate": 5.478562704891688e-05, "loss": 0.7629, "step": 12820 }, { "epoch": 2.46, "learning_rate": 5.4399948576203634e-05, "loss": 0.7688, "step": 12840 }, { "epoch": 2.46, "learning_rate": 5.4014270103490384e-05, "loss": 0.7534, "step": 12860 }, { "epoch": 2.47, "learning_rate": 5.362859163077714e-05, "loss": 0.76, "step": 12880 }, { "epoch": 2.47, "learning_rate": 5.324291315806389e-05, "loss": 0.7561, "step": 12900 }, { "epoch": 2.48, "learning_rate": 5.285723468535065e-05, "loss": 0.7562, "step": 12920 }, { "epoch": 2.48, "learning_rate": 5.247155621263739e-05, "loss": 0.7607, "step": 12940 }, { "epoch": 2.48, "learning_rate": 5.2085877739924146e-05, "loss": 0.7612, "step": 12960 }, { "epoch": 2.49, "learning_rate": 5.1700199267210896e-05, "loss": 0.7643, "step": 12980 }, { "epoch": 2.49, "learning_rate": 5.131452079449765e-05, "loss": 0.7656, "step": 13000 }, { "epoch": 2.49, "eval_loss": 0.790121853351593, "eval_runtime": 16.5158, "eval_samples_per_second": 121.096, "eval_steps_per_second": 1.938, "step": 13000 }, { "epoch": 2.49, "learning_rate": 5.09288423217844e-05, "loss": 0.756, "step": 13020 }, { "epoch": 2.5, "learning_rate": 5.054316384907115e-05, "loss": 0.7597, "step": 13040 }, { "epoch": 2.5, "learning_rate": 5.01574853763579e-05, "loss": 0.7525, "step": 13060 }, { "epoch": 2.51, "learning_rate": 4.977180690364466e-05, "loss": 0.7565, "step": 13080 }, { "epoch": 2.51, "learning_rate": 4.938612843093141e-05, "loss": 0.7631, "step": 13100 }, { "epoch": 2.51, "learning_rate": 4.9000449958218165e-05, "loss": 0.7514, "step": 13120 }, { "epoch": 2.52, "learning_rate": 4.861477148550491e-05, "loss": 0.7576, "step": 13140 }, { "epoch": 2.52, "learning_rate": 4.8229093012791664e-05, "loss": 0.7539, "step": 13160 }, { "epoch": 2.53, "learning_rate": 4.7843414540078414e-05, "loss": 0.7586, "step": 13180 }, { "epoch": 2.53, "learning_rate": 4.745773606736517e-05, "loss": 0.7573, "step": 13200 }, { "epoch": 2.53, "eval_loss": 0.7899668216705322, "eval_runtime": 16.509, "eval_samples_per_second": 121.146, "eval_steps_per_second": 1.938, "step": 13200 }, { "epoch": 2.53, "learning_rate": 4.707205759465192e-05, "loss": 0.7671, "step": 13220 }, { "epoch": 2.54, "learning_rate": 4.668637912193868e-05, "loss": 0.758, "step": 13240 }, { "epoch": 2.54, "learning_rate": 4.630070064922542e-05, "loss": 0.7444, "step": 13260 }, { "epoch": 2.54, "learning_rate": 4.5915022176512176e-05, "loss": 0.7548, "step": 13280 }, { "epoch": 2.55, "learning_rate": 4.5529343703798926e-05, "loss": 0.7681, "step": 13300 }, { "epoch": 2.55, "learning_rate": 4.514366523108568e-05, "loss": 0.7599, "step": 13320 }, { "epoch": 2.56, "learning_rate": 4.475798675837243e-05, "loss": 0.7631, "step": 13340 }, { "epoch": 2.56, "learning_rate": 4.437230828565919e-05, "loss": 0.7565, "step": 13360 }, { "epoch": 2.56, "learning_rate": 4.398662981294593e-05, "loss": 0.7586, "step": 13380 }, { "epoch": 2.57, "learning_rate": 4.360095134023269e-05, "loss": 0.7526, "step": 13400 }, { "epoch": 2.57, "eval_loss": 0.7896500825881958, "eval_runtime": 16.5086, "eval_samples_per_second": 121.149, "eval_steps_per_second": 1.938, "step": 13400 }, { "epoch": 2.57, "learning_rate": 4.321527286751944e-05, "loss": 0.7591, "step": 13420 }, { "epoch": 2.58, "learning_rate": 4.2829594394806195e-05, "loss": 0.7645, "step": 13440 }, { "epoch": 2.58, "learning_rate": 4.2443915922092944e-05, "loss": 0.7532, "step": 13460 }, { "epoch": 2.58, "learning_rate": 4.2077521373015355e-05, "loss": 0.746, "step": 13480 }, { "epoch": 2.59, "learning_rate": 4.169184290030211e-05, "loss": 0.7534, "step": 13500 }, { "epoch": 2.59, "learning_rate": 4.130616442758886e-05, "loss": 0.7506, "step": 13520 }, { "epoch": 2.59, "learning_rate": 4.092048595487562e-05, "loss": 0.7535, "step": 13540 }, { "epoch": 2.6, "learning_rate": 4.053480748216237e-05, "loss": 0.7596, "step": 13560 }, { "epoch": 2.6, "learning_rate": 4.0149129009449124e-05, "loss": 0.7686, "step": 13580 }, { "epoch": 2.61, "learning_rate": 3.976345053673587e-05, "loss": 0.7537, "step": 13600 }, { "epoch": 2.61, "eval_loss": 0.7891342639923096, "eval_runtime": 16.5163, "eval_samples_per_second": 121.092, "eval_steps_per_second": 1.937, "step": 13600 }, { "epoch": 2.61, "learning_rate": 3.9377772064022624e-05, "loss": 0.7656, "step": 13620 }, { "epoch": 2.61, "learning_rate": 3.8992093591309374e-05, "loss": 0.7515, "step": 13640 }, { "epoch": 2.62, "learning_rate": 3.860641511859613e-05, "loss": 0.761, "step": 13660 }, { "epoch": 2.62, "learning_rate": 3.822073664588288e-05, "loss": 0.7648, "step": 13680 }, { "epoch": 2.63, "learning_rate": 3.783505817316963e-05, "loss": 0.7671, "step": 13700 }, { "epoch": 2.63, "learning_rate": 3.7449379700456386e-05, "loss": 0.7653, "step": 13720 }, { "epoch": 2.63, "learning_rate": 3.7063701227743136e-05, "loss": 0.7583, "step": 13740 }, { "epoch": 2.64, "learning_rate": 3.6678022755029886e-05, "loss": 0.7602, "step": 13760 }, { "epoch": 2.64, "learning_rate": 3.629234428231664e-05, "loss": 0.7626, "step": 13780 }, { "epoch": 2.64, "learning_rate": 3.590666580960339e-05, "loss": 0.7485, "step": 13800 }, { "epoch": 2.64, "eval_loss": 0.7891269326210022, "eval_runtime": 16.5041, "eval_samples_per_second": 121.182, "eval_steps_per_second": 1.939, "step": 13800 }, { "epoch": 2.65, "learning_rate": 3.552098733689014e-05, "loss": 0.7564, "step": 13820 }, { "epoch": 2.65, "learning_rate": 3.51353088641769e-05, "loss": 0.7603, "step": 13840 }, { "epoch": 2.66, "learning_rate": 3.474963039146365e-05, "loss": 0.7584, "step": 13860 }, { "epoch": 2.66, "learning_rate": 3.43639519187504e-05, "loss": 0.7608, "step": 13880 }, { "epoch": 2.66, "learning_rate": 3.3978273446037154e-05, "loss": 0.7535, "step": 13900 }, { "epoch": 2.67, "learning_rate": 3.3592594973323904e-05, "loss": 0.7614, "step": 13920 }, { "epoch": 2.67, "learning_rate": 3.3206916500610654e-05, "loss": 0.7654, "step": 13940 }, { "epoch": 2.67, "learning_rate": 3.282123802789741e-05, "loss": 0.7656, "step": 13960 }, { "epoch": 2.68, "learning_rate": 3.243555955518416e-05, "loss": 0.756, "step": 13980 }, { "epoch": 2.68, "learning_rate": 3.204988108247091e-05, "loss": 0.7653, "step": 14000 }, { "epoch": 2.68, "eval_loss": 0.7888805866241455, "eval_runtime": 16.5275, "eval_samples_per_second": 121.01, "eval_steps_per_second": 1.936, "step": 14000 }, { "epoch": 2.69, "learning_rate": 3.1664202609757666e-05, "loss": 0.7618, "step": 14020 }, { "epoch": 2.69, "learning_rate": 3.1278524137044416e-05, "loss": 0.7588, "step": 14040 }, { "epoch": 2.69, "learning_rate": 3.0892845664331166e-05, "loss": 0.7625, "step": 14060 }, { "epoch": 2.7, "learning_rate": 3.050716719161792e-05, "loss": 0.7532, "step": 14080 }, { "epoch": 2.7, "learning_rate": 3.0121488718904672e-05, "loss": 0.7538, "step": 14100 }, { "epoch": 2.71, "learning_rate": 2.9735810246191422e-05, "loss": 0.7531, "step": 14120 }, { "epoch": 2.71, "learning_rate": 2.9350131773478175e-05, "loss": 0.7551, "step": 14140 }, { "epoch": 2.71, "learning_rate": 2.8964453300764928e-05, "loss": 0.7479, "step": 14160 }, { "epoch": 2.72, "learning_rate": 2.8578774828051678e-05, "loss": 0.7629, "step": 14180 }, { "epoch": 2.72, "learning_rate": 2.819309635533843e-05, "loss": 0.7572, "step": 14200 }, { "epoch": 2.72, "eval_loss": 0.7884878516197205, "eval_runtime": 16.7595, "eval_samples_per_second": 119.335, "eval_steps_per_second": 1.909, "step": 14200 }, { "epoch": 2.72, "learning_rate": 2.7807417882625184e-05, "loss": 0.758, "step": 14220 }, { "epoch": 2.73, "learning_rate": 2.7421739409911934e-05, "loss": 0.7608, "step": 14240 }, { "epoch": 2.73, "learning_rate": 2.7036060937198687e-05, "loss": 0.7555, "step": 14260 }, { "epoch": 2.74, "learning_rate": 2.6650382464485437e-05, "loss": 0.7512, "step": 14280 }, { "epoch": 2.74, "learning_rate": 2.626470399177219e-05, "loss": 0.7488, "step": 14300 }, { "epoch": 2.74, "learning_rate": 2.5879025519058943e-05, "loss": 0.7532, "step": 14320 }, { "epoch": 2.75, "learning_rate": 2.5493347046345693e-05, "loss": 0.7525, "step": 14340 }, { "epoch": 2.75, "learning_rate": 2.5107668573632446e-05, "loss": 0.7662, "step": 14360 }, { "epoch": 2.76, "learning_rate": 2.47219901009192e-05, "loss": 0.7583, "step": 14380 }, { "epoch": 2.76, "learning_rate": 2.433631162820595e-05, "loss": 0.7442, "step": 14400 }, { "epoch": 2.76, "eval_loss": 0.7883238196372986, "eval_runtime": 16.474, "eval_samples_per_second": 121.403, "eval_steps_per_second": 1.942, "step": 14400 }, { "epoch": 2.76, "learning_rate": 2.3950633155492702e-05, "loss": 0.7612, "step": 14420 }, { "epoch": 2.77, "learning_rate": 2.3564954682779455e-05, "loss": 0.7571, "step": 14440 }, { "epoch": 2.77, "learning_rate": 2.3179276210066205e-05, "loss": 0.7511, "step": 14460 }, { "epoch": 2.77, "learning_rate": 2.2793597737352958e-05, "loss": 0.7567, "step": 14480 }, { "epoch": 2.78, "learning_rate": 2.2407919264639708e-05, "loss": 0.7555, "step": 14500 }, { "epoch": 2.78, "learning_rate": 2.202224079192646e-05, "loss": 0.7555, "step": 14520 }, { "epoch": 2.79, "learning_rate": 2.1636562319213214e-05, "loss": 0.7509, "step": 14540 }, { "epoch": 2.79, "learning_rate": 2.1250883846499964e-05, "loss": 0.7585, "step": 14560 }, { "epoch": 2.79, "learning_rate": 2.0865205373786717e-05, "loss": 0.7621, "step": 14580 }, { "epoch": 2.8, "learning_rate": 2.047952690107347e-05, "loss": 0.7601, "step": 14600 }, { "epoch": 2.8, "eval_loss": 0.7880419492721558, "eval_runtime": 16.6163, "eval_samples_per_second": 120.364, "eval_steps_per_second": 1.926, "step": 14600 }, { "epoch": 2.8, "learning_rate": 2.009384842836022e-05, "loss": 0.7574, "step": 14620 }, { "epoch": 2.81, "learning_rate": 1.9708169955646973e-05, "loss": 0.7538, "step": 14640 }, { "epoch": 2.81, "learning_rate": 1.9322491482933726e-05, "loss": 0.7611, "step": 14660 }, { "epoch": 2.81, "learning_rate": 1.8936813010220476e-05, "loss": 0.7519, "step": 14680 }, { "epoch": 2.82, "learning_rate": 1.855113453750723e-05, "loss": 0.7559, "step": 14700 }, { "epoch": 2.82, "learning_rate": 1.8165456064793982e-05, "loss": 0.7596, "step": 14720 }, { "epoch": 2.82, "learning_rate": 1.7779777592080735e-05, "loss": 0.7564, "step": 14740 }, { "epoch": 2.83, "learning_rate": 1.7394099119367485e-05, "loss": 0.7526, "step": 14760 }, { "epoch": 2.83, "learning_rate": 1.7008420646654238e-05, "loss": 0.7624, "step": 14780 }, { "epoch": 2.84, "learning_rate": 1.662274217394099e-05, "loss": 0.7569, "step": 14800 }, { "epoch": 2.84, "eval_loss": 0.7879504561424255, "eval_runtime": 16.5411, "eval_samples_per_second": 120.911, "eval_steps_per_second": 1.935, "step": 14800 }, { "epoch": 2.84, "learning_rate": 1.623706370122774e-05, "loss": 0.7543, "step": 14820 }, { "epoch": 2.84, "learning_rate": 1.5851385228514494e-05, "loss": 0.7533, "step": 14840 }, { "epoch": 2.85, "learning_rate": 1.5465706755801247e-05, "loss": 0.7579, "step": 14860 }, { "epoch": 2.85, "learning_rate": 1.5080028283087997e-05, "loss": 0.7638, "step": 14880 }, { "epoch": 2.85, "learning_rate": 1.469434981037475e-05, "loss": 0.7456, "step": 14900 }, { "epoch": 2.86, "learning_rate": 1.4308671337661502e-05, "loss": 0.7561, "step": 14920 }, { "epoch": 2.86, "learning_rate": 1.3922992864948253e-05, "loss": 0.7626, "step": 14940 }, { "epoch": 2.87, "learning_rate": 1.3537314392235005e-05, "loss": 0.7686, "step": 14960 }, { "epoch": 2.87, "learning_rate": 1.3151635919521758e-05, "loss": 0.7512, "step": 14980 }, { "epoch": 2.87, "learning_rate": 1.276595744680851e-05, "loss": 0.7526, "step": 15000 }, { "epoch": 2.87, "eval_loss": 0.7875809073448181, "eval_runtime": 16.5086, "eval_samples_per_second": 121.149, "eval_steps_per_second": 1.938, "step": 15000 }, { "epoch": 2.88, "learning_rate": 1.238027897409526e-05, "loss": 0.7582, "step": 15020 }, { "epoch": 2.88, "learning_rate": 1.1994600501382012e-05, "loss": 0.7584, "step": 15040 }, { "epoch": 2.89, "learning_rate": 1.1608922028668765e-05, "loss": 0.7643, "step": 15060 }, { "epoch": 2.89, "learning_rate": 1.1223243555955517e-05, "loss": 0.7693, "step": 15080 }, { "epoch": 2.89, "learning_rate": 1.0837565083242268e-05, "loss": 0.7591, "step": 15100 }, { "epoch": 2.9, "learning_rate": 1.0451886610529021e-05, "loss": 0.7482, "step": 15120 }, { "epoch": 2.9, "learning_rate": 1.0066208137815773e-05, "loss": 0.7553, "step": 15140 }, { "epoch": 2.9, "learning_rate": 9.680529665102524e-06, "loss": 0.7563, "step": 15160 }, { "epoch": 2.91, "learning_rate": 9.294851192389277e-06, "loss": 0.7639, "step": 15180 }, { "epoch": 2.91, "learning_rate": 8.909172719676029e-06, "loss": 0.7577, "step": 15200 }, { "epoch": 2.91, "eval_loss": 0.7872186303138733, "eval_runtime": 16.5027, "eval_samples_per_second": 121.192, "eval_steps_per_second": 1.939, "step": 15200 } ], "max_steps": 15657, "num_train_epochs": 3, "total_flos": 3.951639155229852e+19, "trial_name": null, "trial_params": null }