{ "best_metric": 0.7255927324295044, "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-belle-7b/checkpoint-12200", "epoch": 2.93204859161174, "global_step": 12400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.9999999999999995e-05, "loss": 1.8908, "step": 20 }, { "epoch": 0.01, "learning_rate": 0.00011999999999999999, "loss": 1.5545, "step": 40 }, { "epoch": 0.01, "learning_rate": 0.00017999999999999998, "loss": 1.1252, "step": 60 }, { "epoch": 0.02, "learning_rate": 0.00023999999999999998, "loss": 1.054, "step": 80 }, { "epoch": 0.02, "learning_rate": 0.0003, "loss": 1.0137, "step": 100 }, { "epoch": 0.03, "learning_rate": 0.0002995233177087471, "loss": 1.0046, "step": 120 }, { "epoch": 0.03, "learning_rate": 0.0002990466354174942, "loss": 0.9867, "step": 140 }, { "epoch": 0.04, "learning_rate": 0.00029856995312624134, "loss": 0.9612, "step": 160 }, { "epoch": 0.04, "learning_rate": 0.0002980932708349884, "loss": 0.9588, "step": 180 }, { "epoch": 0.05, "learning_rate": 0.0002976165885437356, "loss": 0.9551, "step": 200 }, { "epoch": 0.05, "eval_loss": 0.9459459185600281, "eval_runtime": 19.4211, "eval_samples_per_second": 102.981, "eval_steps_per_second": 3.244, "step": 200 }, { "epoch": 0.05, "learning_rate": 0.0002971399062524827, "loss": 0.9516, "step": 220 }, { "epoch": 0.06, "learning_rate": 0.00029666322396122984, "loss": 0.937, "step": 240 }, { "epoch": 0.06, "learning_rate": 0.0002961865416699769, "loss": 0.936, "step": 260 }, { "epoch": 0.07, "learning_rate": 0.00029570985937872406, "loss": 0.9305, "step": 280 }, { "epoch": 0.07, "learning_rate": 0.00029523317708747115, "loss": 0.9146, "step": 300 }, { "epoch": 0.08, "learning_rate": 0.0002947564947962183, "loss": 0.9226, "step": 320 }, { "epoch": 0.08, "learning_rate": 0.00029427981250496543, "loss": 0.9108, "step": 340 }, { "epoch": 0.09, "learning_rate": 0.0002938031302137125, "loss": 0.9129, "step": 360 }, { "epoch": 0.09, "learning_rate": 0.00029332644792245965, "loss": 0.9063, "step": 380 }, { "epoch": 0.09, "learning_rate": 0.0002928497656312068, "loss": 0.8996, "step": 400 }, { "epoch": 0.09, "eval_loss": 0.9003962874412537, "eval_runtime": 19.5716, "eval_samples_per_second": 102.189, "eval_steps_per_second": 3.219, "step": 400 }, { "epoch": 0.1, "learning_rate": 0.00029237308333995393, "loss": 0.898, "step": 420 }, { "epoch": 0.1, "learning_rate": 0.000291896401048701, "loss": 0.8936, "step": 440 }, { "epoch": 0.11, "learning_rate": 0.00029141971875744815, "loss": 0.8932, "step": 460 }, { "epoch": 0.11, "learning_rate": 0.00029094303646619524, "loss": 0.8779, "step": 480 }, { "epoch": 0.12, "learning_rate": 0.0002904663541749424, "loss": 0.8871, "step": 500 }, { "epoch": 0.12, "learning_rate": 0.00028998967188368946, "loss": 0.8929, "step": 520 }, { "epoch": 0.13, "learning_rate": 0.0002895129895924366, "loss": 0.8878, "step": 540 }, { "epoch": 0.13, "learning_rate": 0.00028903630730118374, "loss": 0.8818, "step": 560 }, { "epoch": 0.14, "learning_rate": 0.00028855962500993083, "loss": 0.8826, "step": 580 }, { "epoch": 0.14, "learning_rate": 0.00028808294271867797, "loss": 0.879, "step": 600 }, { "epoch": 0.14, "eval_loss": 0.8738257884979248, "eval_runtime": 19.3526, "eval_samples_per_second": 103.345, "eval_steps_per_second": 3.255, "step": 600 }, { "epoch": 0.15, "learning_rate": 0.0002876062604274251, "loss": 0.87, "step": 620 }, { "epoch": 0.15, "learning_rate": 0.00028712957813617224, "loss": 0.8715, "step": 640 }, { "epoch": 0.16, "learning_rate": 0.00028665289584491933, "loss": 0.8724, "step": 660 }, { "epoch": 0.16, "learning_rate": 0.00028617621355366647, "loss": 0.8741, "step": 680 }, { "epoch": 0.17, "learning_rate": 0.00028569953126241355, "loss": 0.8705, "step": 700 }, { "epoch": 0.17, "learning_rate": 0.0002852228489711607, "loss": 0.8702, "step": 720 }, { "epoch": 0.17, "learning_rate": 0.00028474616667990783, "loss": 0.8618, "step": 740 }, { "epoch": 0.18, "learning_rate": 0.0002842694843886549, "loss": 0.8617, "step": 760 }, { "epoch": 0.18, "learning_rate": 0.00028379280209740206, "loss": 0.8677, "step": 780 }, { "epoch": 0.19, "learning_rate": 0.0002833161198061492, "loss": 0.853, "step": 800 }, { "epoch": 0.19, "eval_loss": 0.8541846871376038, "eval_runtime": 19.49, "eval_samples_per_second": 102.617, "eval_steps_per_second": 3.232, "step": 800 }, { "epoch": 0.19, "learning_rate": 0.0002828394375148963, "loss": 0.8549, "step": 820 }, { "epoch": 0.2, "learning_rate": 0.0002823627552236434, "loss": 0.847, "step": 840 }, { "epoch": 0.2, "learning_rate": 0.00028188607293239056, "loss": 0.8585, "step": 860 }, { "epoch": 0.21, "learning_rate": 0.00028140939064113764, "loss": 0.8568, "step": 880 }, { "epoch": 0.21, "learning_rate": 0.0002809327083498848, "loss": 0.8482, "step": 900 }, { "epoch": 0.22, "learning_rate": 0.00028045602605863187, "loss": 0.845, "step": 920 }, { "epoch": 0.22, "learning_rate": 0.000279979343767379, "loss": 0.8548, "step": 940 }, { "epoch": 0.23, "learning_rate": 0.00027950266147612615, "loss": 0.8372, "step": 960 }, { "epoch": 0.23, "learning_rate": 0.0002790259791848733, "loss": 0.8423, "step": 980 }, { "epoch": 0.24, "learning_rate": 0.00027854929689362037, "loss": 0.8433, "step": 1000 }, { "epoch": 0.24, "eval_loss": 0.8396860361099243, "eval_runtime": 19.3459, "eval_samples_per_second": 103.381, "eval_steps_per_second": 3.257, "step": 1000 }, { "epoch": 0.24, "learning_rate": 0.0002780726146023675, "loss": 0.848, "step": 1020 }, { "epoch": 0.25, "learning_rate": 0.0002775959323111146, "loss": 0.8383, "step": 1040 }, { "epoch": 0.25, "learning_rate": 0.00027711925001986173, "loss": 0.8385, "step": 1060 }, { "epoch": 0.26, "learning_rate": 0.0002766425677286089, "loss": 0.8308, "step": 1080 }, { "epoch": 0.26, "learning_rate": 0.00027616588543735596, "loss": 0.8244, "step": 1100 }, { "epoch": 0.26, "learning_rate": 0.0002756892031461031, "loss": 0.835, "step": 1120 }, { "epoch": 0.27, "learning_rate": 0.0002752125208548502, "loss": 0.8337, "step": 1140 }, { "epoch": 0.27, "learning_rate": 0.0002747358385635974, "loss": 0.8348, "step": 1160 }, { "epoch": 0.28, "learning_rate": 0.00027425915627234446, "loss": 0.8353, "step": 1180 }, { "epoch": 0.28, "learning_rate": 0.0002737824739810916, "loss": 0.8294, "step": 1200 }, { "epoch": 0.28, "eval_loss": 0.8274422287940979, "eval_runtime": 19.4187, "eval_samples_per_second": 102.993, "eval_steps_per_second": 3.244, "step": 1200 }, { "epoch": 0.29, "learning_rate": 0.0002733057916898387, "loss": 0.8337, "step": 1220 }, { "epoch": 0.29, "learning_rate": 0.0002728291093985858, "loss": 0.8435, "step": 1240 }, { "epoch": 0.3, "learning_rate": 0.00027235242710733296, "loss": 0.8347, "step": 1260 }, { "epoch": 0.3, "learning_rate": 0.00027187574481608005, "loss": 0.8258, "step": 1280 }, { "epoch": 0.31, "learning_rate": 0.0002713990625248272, "loss": 0.8304, "step": 1300 }, { "epoch": 0.31, "learning_rate": 0.0002709223802335743, "loss": 0.8264, "step": 1320 }, { "epoch": 0.32, "learning_rate": 0.0002704456979423214, "loss": 0.8313, "step": 1340 }, { "epoch": 0.32, "learning_rate": 0.00026996901565106855, "loss": 0.814, "step": 1360 }, { "epoch": 0.33, "learning_rate": 0.0002694923333598157, "loss": 0.8223, "step": 1380 }, { "epoch": 0.33, "learning_rate": 0.0002690156510685628, "loss": 0.8159, "step": 1400 }, { "epoch": 0.33, "eval_loss": 0.8179089426994324, "eval_runtime": 19.4736, "eval_samples_per_second": 102.703, "eval_steps_per_second": 3.235, "step": 1400 }, { "epoch": 0.34, "learning_rate": 0.0002685389687773099, "loss": 0.8218, "step": 1420 }, { "epoch": 0.34, "learning_rate": 0.000268062286486057, "loss": 0.808, "step": 1440 }, { "epoch": 0.35, "learning_rate": 0.00026758560419480414, "loss": 0.8253, "step": 1460 }, { "epoch": 0.35, "learning_rate": 0.0002671089219035513, "loss": 0.8174, "step": 1480 }, { "epoch": 0.35, "learning_rate": 0.00026663223961229836, "loss": 0.8157, "step": 1500 }, { "epoch": 0.36, "learning_rate": 0.0002661555573210455, "loss": 0.8142, "step": 1520 }, { "epoch": 0.36, "learning_rate": 0.0002656788750297926, "loss": 0.8112, "step": 1540 }, { "epoch": 0.37, "learning_rate": 0.00026520219273853973, "loss": 0.8232, "step": 1560 }, { "epoch": 0.37, "learning_rate": 0.00026472551044728687, "loss": 0.8254, "step": 1580 }, { "epoch": 0.38, "learning_rate": 0.000264248828156034, "loss": 0.8059, "step": 1600 }, { "epoch": 0.38, "eval_loss": 0.8101135492324829, "eval_runtime": 19.5846, "eval_samples_per_second": 102.121, "eval_steps_per_second": 3.217, "step": 1600 }, { "epoch": 0.38, "learning_rate": 0.0002637721458647811, "loss": 0.8062, "step": 1620 }, { "epoch": 0.39, "learning_rate": 0.00026329546357352823, "loss": 0.805, "step": 1640 }, { "epoch": 0.39, "learning_rate": 0.0002628187812822753, "loss": 0.8109, "step": 1660 }, { "epoch": 0.4, "learning_rate": 0.00026234209899102245, "loss": 0.801, "step": 1680 }, { "epoch": 0.4, "learning_rate": 0.0002618654166997696, "loss": 0.8043, "step": 1700 }, { "epoch": 0.41, "learning_rate": 0.0002613887344085167, "loss": 0.8002, "step": 1720 }, { "epoch": 0.41, "learning_rate": 0.0002609120521172638, "loss": 0.8152, "step": 1740 }, { "epoch": 0.42, "learning_rate": 0.00026043536982601096, "loss": 0.8052, "step": 1760 }, { "epoch": 0.42, "learning_rate": 0.0002599586875347581, "loss": 0.8136, "step": 1780 }, { "epoch": 0.43, "learning_rate": 0.0002594820052435052, "loss": 0.8044, "step": 1800 }, { "epoch": 0.43, "eval_loss": 0.8030326962471008, "eval_runtime": 19.4835, "eval_samples_per_second": 102.651, "eval_steps_per_second": 3.234, "step": 1800 }, { "epoch": 0.43, "learning_rate": 0.0002590053229522523, "loss": 0.7995, "step": 1820 }, { "epoch": 0.44, "learning_rate": 0.0002585286406609994, "loss": 0.7958, "step": 1840 }, { "epoch": 0.44, "learning_rate": 0.00025805195836974654, "loss": 0.8034, "step": 1860 }, { "epoch": 0.44, "learning_rate": 0.00025757527607849363, "loss": 0.8016, "step": 1880 }, { "epoch": 0.45, "learning_rate": 0.00025709859378724077, "loss": 0.8048, "step": 1900 }, { "epoch": 0.45, "learning_rate": 0.0002566219114959879, "loss": 0.8004, "step": 1920 }, { "epoch": 0.46, "learning_rate": 0.00025614522920473505, "loss": 0.8041, "step": 1940 }, { "epoch": 0.46, "learning_rate": 0.00025566854691348213, "loss": 0.7908, "step": 1960 }, { "epoch": 0.47, "learning_rate": 0.00025519186462222927, "loss": 0.7958, "step": 1980 }, { "epoch": 0.47, "learning_rate": 0.0002547151823309764, "loss": 0.8013, "step": 2000 }, { "epoch": 0.47, "eval_loss": 0.7965430021286011, "eval_runtime": 19.4852, "eval_samples_per_second": 102.642, "eval_steps_per_second": 3.233, "step": 2000 }, { "epoch": 0.48, "learning_rate": 0.0002542385000397235, "loss": 0.803, "step": 2020 }, { "epoch": 0.48, "learning_rate": 0.00025376181774847064, "loss": 0.7966, "step": 2040 }, { "epoch": 0.49, "learning_rate": 0.0002532851354572177, "loss": 0.7946, "step": 2060 }, { "epoch": 0.49, "learning_rate": 0.00025280845316596486, "loss": 0.8023, "step": 2080 }, { "epoch": 0.5, "learning_rate": 0.00025233177087471194, "loss": 0.7953, "step": 2100 }, { "epoch": 0.5, "learning_rate": 0.0002518550885834591, "loss": 0.8053, "step": 2120 }, { "epoch": 0.51, "learning_rate": 0.0002513784062922062, "loss": 0.7883, "step": 2140 }, { "epoch": 0.51, "learning_rate": 0.00025090172400095336, "loss": 0.7984, "step": 2160 }, { "epoch": 0.52, "learning_rate": 0.00025042504170970045, "loss": 0.7962, "step": 2180 }, { "epoch": 0.52, "learning_rate": 0.0002499483594184476, "loss": 0.7847, "step": 2200 }, { "epoch": 0.52, "eval_loss": 0.7915623784065247, "eval_runtime": 19.5509, "eval_samples_per_second": 102.297, "eval_steps_per_second": 3.222, "step": 2200 }, { "epoch": 0.52, "learning_rate": 0.0002494716771271947, "loss": 0.7917, "step": 2220 }, { "epoch": 0.53, "learning_rate": 0.0002489949948359418, "loss": 0.7942, "step": 2240 }, { "epoch": 0.53, "learning_rate": 0.00024851831254468895, "loss": 0.7921, "step": 2260 }, { "epoch": 0.54, "learning_rate": 0.00024804163025343603, "loss": 0.7971, "step": 2280 }, { "epoch": 0.54, "learning_rate": 0.0002475649479621832, "loss": 0.7919, "step": 2300 }, { "epoch": 0.55, "learning_rate": 0.0002470882656709303, "loss": 0.7917, "step": 2320 }, { "epoch": 0.55, "learning_rate": 0.00024661158337967745, "loss": 0.8024, "step": 2340 }, { "epoch": 0.56, "learning_rate": 0.00024613490108842454, "loss": 0.7761, "step": 2360 }, { "epoch": 0.56, "learning_rate": 0.0002456582187971717, "loss": 0.7958, "step": 2380 }, { "epoch": 0.57, "learning_rate": 0.00024518153650591876, "loss": 0.7855, "step": 2400 }, { "epoch": 0.57, "eval_loss": 0.7870249152183533, "eval_runtime": 19.5953, "eval_samples_per_second": 102.065, "eval_steps_per_second": 3.215, "step": 2400 }, { "epoch": 0.57, "learning_rate": 0.0002447048542146659, "loss": 0.784, "step": 2420 }, { "epoch": 0.58, "learning_rate": 0.00024422817192341304, "loss": 0.7926, "step": 2440 }, { "epoch": 0.58, "learning_rate": 0.00024375148963216013, "loss": 0.7845, "step": 2460 }, { "epoch": 0.59, "learning_rate": 0.00024327480734090726, "loss": 0.782, "step": 2480 }, { "epoch": 0.59, "learning_rate": 0.00024279812504965438, "loss": 0.7808, "step": 2500 }, { "epoch": 0.6, "learning_rate": 0.00024232144275840152, "loss": 0.7926, "step": 2520 }, { "epoch": 0.6, "learning_rate": 0.00024184476046714863, "loss": 0.7795, "step": 2540 }, { "epoch": 0.61, "learning_rate": 0.00024136807817589574, "loss": 0.7888, "step": 2560 }, { "epoch": 0.61, "learning_rate": 0.00024089139588464288, "loss": 0.7888, "step": 2580 }, { "epoch": 0.61, "learning_rate": 0.00024041471359339, "loss": 0.7863, "step": 2600 }, { "epoch": 0.61, "eval_loss": 0.7825512290000916, "eval_runtime": 19.4274, "eval_samples_per_second": 102.948, "eval_steps_per_second": 3.243, "step": 2600 }, { "epoch": 0.62, "learning_rate": 0.0002399380313021371, "loss": 0.7881, "step": 2620 }, { "epoch": 0.62, "learning_rate": 0.00023946134901088422, "loss": 0.7841, "step": 2640 }, { "epoch": 0.63, "learning_rate": 0.00023898466671963133, "loss": 0.7849, "step": 2660 }, { "epoch": 0.63, "learning_rate": 0.00023850798442837844, "loss": 0.7809, "step": 2680 }, { "epoch": 0.64, "learning_rate": 0.0002380313021371256, "loss": 0.7757, "step": 2700 }, { "epoch": 0.64, "learning_rate": 0.00023755461984587272, "loss": 0.7787, "step": 2720 }, { "epoch": 0.65, "learning_rate": 0.00023707793755461983, "loss": 0.7766, "step": 2740 }, { "epoch": 0.65, "learning_rate": 0.00023660125526336694, "loss": 0.7867, "step": 2760 }, { "epoch": 0.66, "learning_rate": 0.00023612457297211405, "loss": 0.7767, "step": 2780 }, { "epoch": 0.66, "learning_rate": 0.0002356478906808612, "loss": 0.7806, "step": 2800 }, { "epoch": 0.66, "eval_loss": 0.7781409621238708, "eval_runtime": 20.131, "eval_samples_per_second": 99.349, "eval_steps_per_second": 3.13, "step": 2800 }, { "epoch": 0.67, "learning_rate": 0.0002351712083896083, "loss": 0.7774, "step": 2820 }, { "epoch": 0.67, "learning_rate": 0.00023469452609835542, "loss": 0.7782, "step": 2840 }, { "epoch": 0.68, "learning_rate": 0.00023421784380710253, "loss": 0.7773, "step": 2860 }, { "epoch": 0.68, "learning_rate": 0.00023374116151584964, "loss": 0.7845, "step": 2880 }, { "epoch": 0.69, "learning_rate": 0.0002332644792245968, "loss": 0.7879, "step": 2900 }, { "epoch": 0.69, "learning_rate": 0.00023278779693334392, "loss": 0.7801, "step": 2920 }, { "epoch": 0.7, "learning_rate": 0.00023231111464209103, "loss": 0.7713, "step": 2940 }, { "epoch": 0.7, "learning_rate": 0.00023183443235083814, "loss": 0.7742, "step": 2960 }, { "epoch": 0.7, "learning_rate": 0.00023135775005958526, "loss": 0.7783, "step": 2980 }, { "epoch": 0.71, "learning_rate": 0.0002308810677683324, "loss": 0.7698, "step": 3000 }, { "epoch": 0.71, "eval_loss": 0.7747411131858826, "eval_runtime": 20.0968, "eval_samples_per_second": 99.519, "eval_steps_per_second": 3.135, "step": 3000 }, { "epoch": 0.71, "learning_rate": 0.0002304043854770795, "loss": 0.7696, "step": 3020 }, { "epoch": 0.72, "learning_rate": 0.00022992770318582662, "loss": 0.7744, "step": 3040 }, { "epoch": 0.72, "learning_rate": 0.00022945102089457373, "loss": 0.7687, "step": 3060 }, { "epoch": 0.73, "learning_rate": 0.00022897433860332084, "loss": 0.7765, "step": 3080 }, { "epoch": 0.73, "learning_rate": 0.000228497656312068, "loss": 0.7709, "step": 3100 }, { "epoch": 0.74, "learning_rate": 0.00022802097402081512, "loss": 0.773, "step": 3120 }, { "epoch": 0.74, "learning_rate": 0.00022754429172956224, "loss": 0.7862, "step": 3140 }, { "epoch": 0.75, "learning_rate": 0.00022706760943830935, "loss": 0.7668, "step": 3160 }, { "epoch": 0.75, "learning_rate": 0.00022659092714705646, "loss": 0.7816, "step": 3180 }, { "epoch": 0.76, "learning_rate": 0.00022611424485580357, "loss": 0.7831, "step": 3200 }, { "epoch": 0.76, "eval_loss": 0.7719215154647827, "eval_runtime": 19.6387, "eval_samples_per_second": 101.84, "eval_steps_per_second": 3.208, "step": 3200 }, { "epoch": 0.76, "learning_rate": 0.0002256375625645507, "loss": 0.7723, "step": 3220 }, { "epoch": 0.77, "learning_rate": 0.00022516088027329782, "loss": 0.7727, "step": 3240 }, { "epoch": 0.77, "learning_rate": 0.00022468419798204493, "loss": 0.7719, "step": 3260 }, { "epoch": 0.78, "learning_rate": 0.00022420751569079207, "loss": 0.7796, "step": 3280 }, { "epoch": 0.78, "learning_rate": 0.0002237308333995392, "loss": 0.7685, "step": 3300 }, { "epoch": 0.79, "learning_rate": 0.00022325415110828633, "loss": 0.7725, "step": 3320 }, { "epoch": 0.79, "learning_rate": 0.00022277746881703344, "loss": 0.7638, "step": 3340 }, { "epoch": 0.79, "learning_rate": 0.00022230078652578055, "loss": 0.7771, "step": 3360 }, { "epoch": 0.8, "learning_rate": 0.00022182410423452766, "loss": 0.7689, "step": 3380 }, { "epoch": 0.8, "learning_rate": 0.00022134742194327477, "loss": 0.7797, "step": 3400 }, { "epoch": 0.8, "eval_loss": 0.768983006477356, "eval_runtime": 19.4428, "eval_samples_per_second": 102.866, "eval_steps_per_second": 3.24, "step": 3400 }, { "epoch": 0.81, "learning_rate": 0.0002208707396520219, "loss": 0.7734, "step": 3420 }, { "epoch": 0.81, "learning_rate": 0.00022039405736076903, "loss": 0.7719, "step": 3440 }, { "epoch": 0.82, "learning_rate": 0.00021991737506951614, "loss": 0.767, "step": 3460 }, { "epoch": 0.82, "learning_rate": 0.00021944069277826328, "loss": 0.7758, "step": 3480 }, { "epoch": 0.83, "learning_rate": 0.0002189640104870104, "loss": 0.7768, "step": 3500 }, { "epoch": 0.83, "learning_rate": 0.00021848732819575753, "loss": 0.7641, "step": 3520 }, { "epoch": 0.84, "learning_rate": 0.00021801064590450464, "loss": 0.7694, "step": 3540 }, { "epoch": 0.84, "learning_rate": 0.00021753396361325175, "loss": 0.7835, "step": 3560 }, { "epoch": 0.85, "learning_rate": 0.00021705728132199886, "loss": 0.7642, "step": 3580 }, { "epoch": 0.85, "learning_rate": 0.00021658059903074598, "loss": 0.7719, "step": 3600 }, { "epoch": 0.85, "eval_loss": 0.7660636305809021, "eval_runtime": 19.5996, "eval_samples_per_second": 102.043, "eval_steps_per_second": 3.214, "step": 3600 }, { "epoch": 0.86, "learning_rate": 0.0002161039167394931, "loss": 0.7723, "step": 3620 }, { "epoch": 0.86, "learning_rate": 0.00021562723444824023, "loss": 0.76, "step": 3640 }, { "epoch": 0.87, "learning_rate": 0.00021515055215698734, "loss": 0.7643, "step": 3660 }, { "epoch": 0.87, "learning_rate": 0.00021467386986573448, "loss": 0.7599, "step": 3680 }, { "epoch": 0.87, "learning_rate": 0.0002141971875744816, "loss": 0.7623, "step": 3700 }, { "epoch": 0.88, "learning_rate": 0.0002137205052832287, "loss": 0.7621, "step": 3720 }, { "epoch": 0.88, "learning_rate": 0.00021324382299197584, "loss": 0.7691, "step": 3740 }, { "epoch": 0.89, "learning_rate": 0.00021276714070072295, "loss": 0.7665, "step": 3760 }, { "epoch": 0.89, "learning_rate": 0.00021229045840947007, "loss": 0.7742, "step": 3780 }, { "epoch": 0.9, "learning_rate": 0.00021181377611821718, "loss": 0.7624, "step": 3800 }, { "epoch": 0.9, "eval_loss": 0.7643172740936279, "eval_runtime": 19.487, "eval_samples_per_second": 102.633, "eval_steps_per_second": 3.233, "step": 3800 }, { "epoch": 0.9, "learning_rate": 0.0002113370938269643, "loss": 0.7726, "step": 3820 }, { "epoch": 0.91, "learning_rate": 0.0002108604115357114, "loss": 0.7559, "step": 3840 }, { "epoch": 0.91, "learning_rate": 0.00021038372924445857, "loss": 0.7634, "step": 3860 }, { "epoch": 0.92, "learning_rate": 0.00020990704695320568, "loss": 0.765, "step": 3880 }, { "epoch": 0.92, "learning_rate": 0.0002094303646619528, "loss": 0.7649, "step": 3900 }, { "epoch": 0.93, "learning_rate": 0.0002089536823706999, "loss": 0.763, "step": 3920 }, { "epoch": 0.93, "learning_rate": 0.00020847700007944705, "loss": 0.7679, "step": 3940 }, { "epoch": 0.94, "learning_rate": 0.00020800031778819416, "loss": 0.7644, "step": 3960 }, { "epoch": 0.94, "learning_rate": 0.00020752363549694127, "loss": 0.7655, "step": 3980 }, { "epoch": 0.95, "learning_rate": 0.00020704695320568838, "loss": 0.7681, "step": 4000 }, { "epoch": 0.95, "eval_loss": 0.7610963582992554, "eval_runtime": 19.5269, "eval_samples_per_second": 102.423, "eval_steps_per_second": 3.226, "step": 4000 }, { "epoch": 0.95, "learning_rate": 0.0002065702709144355, "loss": 0.7623, "step": 4020 }, { "epoch": 0.96, "learning_rate": 0.0002060935886231826, "loss": 0.7625, "step": 4040 }, { "epoch": 0.96, "learning_rate": 0.00020561690633192977, "loss": 0.7524, "step": 4060 }, { "epoch": 0.96, "learning_rate": 0.00020514022404067688, "loss": 0.764, "step": 4080 }, { "epoch": 0.97, "learning_rate": 0.000204663541749424, "loss": 0.7513, "step": 4100 }, { "epoch": 0.97, "learning_rate": 0.0002041868594581711, "loss": 0.753, "step": 4120 }, { "epoch": 0.98, "learning_rate": 0.00020371017716691822, "loss": 0.7602, "step": 4140 }, { "epoch": 0.98, "learning_rate": 0.00020323349487566536, "loss": 0.7701, "step": 4160 }, { "epoch": 0.99, "learning_rate": 0.00020275681258441247, "loss": 0.7602, "step": 4180 }, { "epoch": 0.99, "learning_rate": 0.00020228013029315958, "loss": 0.7598, "step": 4200 }, { "epoch": 0.99, "eval_loss": 0.760128915309906, "eval_runtime": 19.4387, "eval_samples_per_second": 102.888, "eval_steps_per_second": 3.241, "step": 4200 }, { "epoch": 1.0, "learning_rate": 0.0002018034480019067, "loss": 0.7579, "step": 4220 }, { "epoch": 1.0, "learning_rate": 0.00020132676571065384, "loss": 0.7628, "step": 4240 }, { "epoch": 1.01, "learning_rate": 0.00020085008341940097, "loss": 0.7551, "step": 4260 }, { "epoch": 1.01, "learning_rate": 0.0002003734011281481, "loss": 0.7582, "step": 4280 }, { "epoch": 1.02, "learning_rate": 0.0001998967188368952, "loss": 0.7623, "step": 4300 }, { "epoch": 1.02, "learning_rate": 0.0001994200365456423, "loss": 0.7504, "step": 4320 }, { "epoch": 1.03, "learning_rate": 0.00019894335425438942, "loss": 0.7587, "step": 4340 }, { "epoch": 1.03, "learning_rate": 0.00019846667196313654, "loss": 0.7528, "step": 4360 }, { "epoch": 1.04, "learning_rate": 0.00019798998967188367, "loss": 0.754, "step": 4380 }, { "epoch": 1.04, "learning_rate": 0.00019751330738063079, "loss": 0.759, "step": 4400 }, { "epoch": 1.04, "eval_loss": 0.7575392127037048, "eval_runtime": 19.5275, "eval_samples_per_second": 102.42, "eval_steps_per_second": 3.226, "step": 4400 }, { "epoch": 1.05, "learning_rate": 0.0001970366250893779, "loss": 0.7592, "step": 4420 }, { "epoch": 1.05, "learning_rate": 0.00019655994279812504, "loss": 0.7548, "step": 4440 }, { "epoch": 1.05, "learning_rate": 0.00019608326050687218, "loss": 0.7632, "step": 4460 }, { "epoch": 1.06, "learning_rate": 0.0001956065782156193, "loss": 0.7472, "step": 4480 }, { "epoch": 1.06, "learning_rate": 0.0001951298959243664, "loss": 0.7496, "step": 4500 }, { "epoch": 1.07, "learning_rate": 0.0001946532136331135, "loss": 0.7549, "step": 4520 }, { "epoch": 1.07, "learning_rate": 0.00019417653134186063, "loss": 0.77, "step": 4540 }, { "epoch": 1.08, "learning_rate": 0.00019369984905060774, "loss": 0.759, "step": 4560 }, { "epoch": 1.08, "learning_rate": 0.00019322316675935488, "loss": 0.7554, "step": 4580 }, { "epoch": 1.09, "learning_rate": 0.000192746484468102, "loss": 0.7577, "step": 4600 }, { "epoch": 1.09, "eval_loss": 0.7568497061729431, "eval_runtime": 19.53, "eval_samples_per_second": 102.406, "eval_steps_per_second": 3.226, "step": 4600 }, { "epoch": 1.09, "learning_rate": 0.0001922698021768491, "loss": 0.7617, "step": 4620 }, { "epoch": 1.1, "learning_rate": 0.00019179311988559624, "loss": 0.7551, "step": 4640 }, { "epoch": 1.1, "learning_rate": 0.00019131643759434335, "loss": 0.7482, "step": 4660 }, { "epoch": 1.11, "learning_rate": 0.0001908397553030905, "loss": 0.7516, "step": 4680 }, { "epoch": 1.11, "learning_rate": 0.0001903630730118376, "loss": 0.7555, "step": 4700 }, { "epoch": 1.12, "learning_rate": 0.00018988639072058472, "loss": 0.7605, "step": 4720 }, { "epoch": 1.12, "learning_rate": 0.00018940970842933183, "loss": 0.7506, "step": 4740 }, { "epoch": 1.13, "learning_rate": 0.00018893302613807894, "loss": 0.7622, "step": 4760 }, { "epoch": 1.13, "learning_rate": 0.00018845634384682605, "loss": 0.75, "step": 4780 }, { "epoch": 1.13, "learning_rate": 0.0001879796615555732, "loss": 0.7572, "step": 4800 }, { "epoch": 1.13, "eval_loss": 0.7548028826713562, "eval_runtime": 19.5411, "eval_samples_per_second": 102.349, "eval_steps_per_second": 3.224, "step": 4800 }, { "epoch": 1.14, "learning_rate": 0.00018750297926432033, "loss": 0.7427, "step": 4820 }, { "epoch": 1.14, "learning_rate": 0.00018702629697306744, "loss": 0.7489, "step": 4840 }, { "epoch": 1.15, "learning_rate": 0.00018654961468181455, "loss": 0.755, "step": 4860 }, { "epoch": 1.15, "learning_rate": 0.00018607293239056167, "loss": 0.7517, "step": 4880 }, { "epoch": 1.16, "learning_rate": 0.0001855962500993088, "loss": 0.7529, "step": 4900 }, { "epoch": 1.16, "learning_rate": 0.00018511956780805592, "loss": 0.7498, "step": 4920 }, { "epoch": 1.17, "learning_rate": 0.00018464288551680303, "loss": 0.756, "step": 4940 }, { "epoch": 1.17, "learning_rate": 0.00018416620322555014, "loss": 0.7492, "step": 4960 }, { "epoch": 1.18, "learning_rate": 0.00018368952093429725, "loss": 0.7491, "step": 4980 }, { "epoch": 1.18, "learning_rate": 0.00018321283864304437, "loss": 0.7585, "step": 5000 }, { "epoch": 1.18, "eval_loss": 0.7538104057312012, "eval_runtime": 19.6106, "eval_samples_per_second": 101.986, "eval_steps_per_second": 3.213, "step": 5000 }, { "epoch": 1.19, "learning_rate": 0.00018273615635179153, "loss": 0.7531, "step": 5020 }, { "epoch": 1.19, "learning_rate": 0.00018225947406053865, "loss": 0.7511, "step": 5040 }, { "epoch": 1.2, "learning_rate": 0.00018178279176928576, "loss": 0.7541, "step": 5060 }, { "epoch": 1.2, "learning_rate": 0.00018130610947803287, "loss": 0.7465, "step": 5080 }, { "epoch": 1.21, "learning_rate": 0.00018082942718678, "loss": 0.7403, "step": 5100 }, { "epoch": 1.21, "learning_rate": 0.00018035274489552712, "loss": 0.749, "step": 5120 }, { "epoch": 1.22, "learning_rate": 0.00017987606260427423, "loss": 0.7548, "step": 5140 }, { "epoch": 1.22, "learning_rate": 0.00017939938031302134, "loss": 0.7443, "step": 5160 }, { "epoch": 1.22, "learning_rate": 0.00017892269802176846, "loss": 0.7461, "step": 5180 }, { "epoch": 1.23, "learning_rate": 0.00017844601573051557, "loss": 0.7511, "step": 5200 }, { "epoch": 1.23, "eval_loss": 0.7509217262268066, "eval_runtime": 19.5437, "eval_samples_per_second": 102.335, "eval_steps_per_second": 3.224, "step": 5200 }, { "epoch": 1.23, "learning_rate": 0.00017796933343926274, "loss": 0.7562, "step": 5220 }, { "epoch": 1.24, "learning_rate": 0.00017749265114800985, "loss": 0.7489, "step": 5240 }, { "epoch": 1.24, "learning_rate": 0.00017701596885675696, "loss": 0.7499, "step": 5260 }, { "epoch": 1.25, "learning_rate": 0.00017653928656550407, "loss": 0.7519, "step": 5280 }, { "epoch": 1.25, "learning_rate": 0.00017606260427425118, "loss": 0.7536, "step": 5300 }, { "epoch": 1.26, "learning_rate": 0.00017558592198299832, "loss": 0.7536, "step": 5320 }, { "epoch": 1.26, "learning_rate": 0.00017510923969174544, "loss": 0.7492, "step": 5340 }, { "epoch": 1.27, "learning_rate": 0.00017463255740049255, "loss": 0.7454, "step": 5360 }, { "epoch": 1.27, "learning_rate": 0.00017415587510923966, "loss": 0.7528, "step": 5380 }, { "epoch": 1.28, "learning_rate": 0.0001736791928179868, "loss": 0.7409, "step": 5400 }, { "epoch": 1.28, "eval_loss": 0.7497395873069763, "eval_runtime": 19.5671, "eval_samples_per_second": 102.212, "eval_steps_per_second": 3.22, "step": 5400 }, { "epoch": 1.28, "learning_rate": 0.00017320251052673394, "loss": 0.7434, "step": 5420 }, { "epoch": 1.29, "learning_rate": 0.00017272582823548105, "loss": 0.7543, "step": 5440 }, { "epoch": 1.29, "learning_rate": 0.00017224914594422816, "loss": 0.7457, "step": 5460 }, { "epoch": 1.3, "learning_rate": 0.00017177246365297527, "loss": 0.7439, "step": 5480 }, { "epoch": 1.3, "learning_rate": 0.0001712957813617224, "loss": 0.7412, "step": 5500 }, { "epoch": 1.31, "learning_rate": 0.0001708190990704695, "loss": 0.7409, "step": 5520 }, { "epoch": 1.31, "learning_rate": 0.00017034241677921664, "loss": 0.7473, "step": 5540 }, { "epoch": 1.31, "learning_rate": 0.00016986573448796375, "loss": 0.7486, "step": 5560 }, { "epoch": 1.32, "learning_rate": 0.00016938905219671086, "loss": 0.7439, "step": 5580 }, { "epoch": 1.32, "learning_rate": 0.000168912369905458, "loss": 0.7524, "step": 5600 }, { "epoch": 1.32, "eval_loss": 0.7480019330978394, "eval_runtime": 19.5018, "eval_samples_per_second": 102.555, "eval_steps_per_second": 3.23, "step": 5600 }, { "epoch": 1.33, "learning_rate": 0.00016843568761420514, "loss": 0.7464, "step": 5620 }, { "epoch": 1.33, "learning_rate": 0.00016795900532295225, "loss": 0.7511, "step": 5640 }, { "epoch": 1.34, "learning_rate": 0.00016748232303169936, "loss": 0.7423, "step": 5660 }, { "epoch": 1.34, "learning_rate": 0.00016700564074044648, "loss": 0.7422, "step": 5680 }, { "epoch": 1.35, "learning_rate": 0.0001665289584491936, "loss": 0.742, "step": 5700 }, { "epoch": 1.35, "learning_rate": 0.0001660522761579407, "loss": 0.7421, "step": 5720 }, { "epoch": 1.36, "learning_rate": 0.00016557559386668784, "loss": 0.749, "step": 5740 }, { "epoch": 1.36, "learning_rate": 0.00016509891157543495, "loss": 0.7432, "step": 5760 }, { "epoch": 1.37, "learning_rate": 0.0001646222292841821, "loss": 0.7426, "step": 5780 }, { "epoch": 1.37, "learning_rate": 0.0001641455469929292, "loss": 0.7543, "step": 5800 }, { "epoch": 1.37, "eval_loss": 0.7470090389251709, "eval_runtime": 19.5563, "eval_samples_per_second": 102.269, "eval_steps_per_second": 3.221, "step": 5800 }, { "epoch": 1.38, "learning_rate": 0.00016366886470167632, "loss": 0.7451, "step": 5820 }, { "epoch": 1.38, "learning_rate": 0.00016319218241042346, "loss": 0.7481, "step": 5840 }, { "epoch": 1.39, "learning_rate": 0.00016271550011917057, "loss": 0.7381, "step": 5860 }, { "epoch": 1.39, "learning_rate": 0.00016223881782791768, "loss": 0.7461, "step": 5880 }, { "epoch": 1.4, "learning_rate": 0.0001617621355366648, "loss": 0.7467, "step": 5900 }, { "epoch": 1.4, "learning_rate": 0.0001612854532454119, "loss": 0.745, "step": 5920 }, { "epoch": 1.4, "learning_rate": 0.00016080877095415902, "loss": 0.745, "step": 5940 }, { "epoch": 1.41, "learning_rate": 0.00016033208866290615, "loss": 0.7386, "step": 5960 }, { "epoch": 1.41, "learning_rate": 0.0001598554063716533, "loss": 0.7363, "step": 5980 }, { "epoch": 1.42, "learning_rate": 0.0001593787240804004, "loss": 0.7412, "step": 6000 }, { "epoch": 1.42, "eval_loss": 0.7454522848129272, "eval_runtime": 19.555, "eval_samples_per_second": 102.276, "eval_steps_per_second": 3.222, "step": 6000 }, { "epoch": 1.42, "learning_rate": 0.00015890204178914752, "loss": 0.7501, "step": 6020 }, { "epoch": 1.43, "learning_rate": 0.00015842535949789463, "loss": 0.7528, "step": 6040 }, { "epoch": 1.43, "learning_rate": 0.00015794867720664177, "loss": 0.7373, "step": 6060 }, { "epoch": 1.44, "learning_rate": 0.00015747199491538888, "loss": 0.7451, "step": 6080 }, { "epoch": 1.44, "learning_rate": 0.000156995312624136, "loss": 0.7384, "step": 6100 }, { "epoch": 1.45, "learning_rate": 0.0001565186303328831, "loss": 0.7471, "step": 6120 }, { "epoch": 1.45, "learning_rate": 0.00015604194804163022, "loss": 0.7454, "step": 6140 }, { "epoch": 1.46, "learning_rate": 0.00015556526575037733, "loss": 0.7415, "step": 6160 }, { "epoch": 1.46, "learning_rate": 0.0001550885834591245, "loss": 0.7514, "step": 6180 }, { "epoch": 1.47, "learning_rate": 0.0001546119011678716, "loss": 0.7343, "step": 6200 }, { "epoch": 1.47, "eval_loss": 0.7457332611083984, "eval_runtime": 19.5673, "eval_samples_per_second": 102.212, "eval_steps_per_second": 3.22, "step": 6200 }, { "epoch": 1.47, "learning_rate": 0.00015413521887661872, "loss": 0.7452, "step": 6220 }, { "epoch": 1.48, "learning_rate": 0.00015365853658536583, "loss": 0.7456, "step": 6240 }, { "epoch": 1.48, "learning_rate": 0.00015318185429411297, "loss": 0.7326, "step": 6260 }, { "epoch": 1.48, "learning_rate": 0.00015270517200286008, "loss": 0.7431, "step": 6280 }, { "epoch": 1.49, "learning_rate": 0.0001522284897116072, "loss": 0.7419, "step": 6300 }, { "epoch": 1.49, "learning_rate": 0.0001517518074203543, "loss": 0.7375, "step": 6320 }, { "epoch": 1.5, "learning_rate": 0.00015127512512910142, "loss": 0.7419, "step": 6340 }, { "epoch": 1.5, "learning_rate": 0.0001507984428378486, "loss": 0.7431, "step": 6360 }, { "epoch": 1.51, "learning_rate": 0.0001503217605465957, "loss": 0.7412, "step": 6380 }, { "epoch": 1.51, "learning_rate": 0.00014984507825534278, "loss": 0.7447, "step": 6400 }, { "epoch": 1.51, "eval_loss": 0.7441338896751404, "eval_runtime": 19.4509, "eval_samples_per_second": 102.823, "eval_steps_per_second": 3.239, "step": 6400 }, { "epoch": 1.52, "learning_rate": 0.00014936839596408992, "loss": 0.7436, "step": 6420 }, { "epoch": 1.52, "learning_rate": 0.00014889171367283704, "loss": 0.7402, "step": 6440 }, { "epoch": 1.53, "learning_rate": 0.00014841503138158415, "loss": 0.7454, "step": 6460 }, { "epoch": 1.53, "learning_rate": 0.0001479383490903313, "loss": 0.738, "step": 6480 }, { "epoch": 1.54, "learning_rate": 0.0001474616667990784, "loss": 0.7396, "step": 6500 }, { "epoch": 1.54, "learning_rate": 0.00014698498450782554, "loss": 0.7333, "step": 6520 }, { "epoch": 1.55, "learning_rate": 0.00014650830221657265, "loss": 0.7482, "step": 6540 }, { "epoch": 1.55, "learning_rate": 0.00014603161992531976, "loss": 0.7376, "step": 6560 }, { "epoch": 1.56, "learning_rate": 0.00014555493763406687, "loss": 0.7369, "step": 6580 }, { "epoch": 1.56, "learning_rate": 0.00014507825534281401, "loss": 0.7347, "step": 6600 }, { "epoch": 1.56, "eval_loss": 0.7425362467765808, "eval_runtime": 19.5248, "eval_samples_per_second": 102.434, "eval_steps_per_second": 3.227, "step": 6600 }, { "epoch": 1.57, "learning_rate": 0.00014460157305156113, "loss": 0.7446, "step": 6620 }, { "epoch": 1.57, "learning_rate": 0.00014412489076030824, "loss": 0.7343, "step": 6640 }, { "epoch": 1.57, "learning_rate": 0.00014364820846905535, "loss": 0.7468, "step": 6660 }, { "epoch": 1.58, "learning_rate": 0.0001431715261778025, "loss": 0.749, "step": 6680 }, { "epoch": 1.58, "learning_rate": 0.0001426948438865496, "loss": 0.7401, "step": 6700 }, { "epoch": 1.59, "learning_rate": 0.0001422181615952967, "loss": 0.7364, "step": 6720 }, { "epoch": 1.59, "learning_rate": 0.00014174147930404385, "loss": 0.7442, "step": 6740 }, { "epoch": 1.6, "learning_rate": 0.00014126479701279096, "loss": 0.7385, "step": 6760 }, { "epoch": 1.6, "learning_rate": 0.00014078811472153808, "loss": 0.7412, "step": 6780 }, { "epoch": 1.61, "learning_rate": 0.00014031143243028522, "loss": 0.7377, "step": 6800 }, { "epoch": 1.61, "eval_loss": 0.7418386936187744, "eval_runtime": 19.5679, "eval_samples_per_second": 102.208, "eval_steps_per_second": 3.22, "step": 6800 }, { "epoch": 1.61, "learning_rate": 0.00013983475013903233, "loss": 0.7432, "step": 6820 }, { "epoch": 1.62, "learning_rate": 0.00013935806784777944, "loss": 0.7379, "step": 6840 }, { "epoch": 1.62, "learning_rate": 0.00013888138555652655, "loss": 0.7346, "step": 6860 }, { "epoch": 1.63, "learning_rate": 0.00013840470326527366, "loss": 0.7373, "step": 6880 }, { "epoch": 1.63, "learning_rate": 0.0001379280209740208, "loss": 0.7403, "step": 6900 }, { "epoch": 1.64, "learning_rate": 0.00013745133868276792, "loss": 0.7477, "step": 6920 }, { "epoch": 1.64, "learning_rate": 0.00013697465639151506, "loss": 0.7343, "step": 6940 }, { "epoch": 1.65, "learning_rate": 0.00013649797410026217, "loss": 0.7419, "step": 6960 }, { "epoch": 1.65, "learning_rate": 0.00013602129180900928, "loss": 0.7327, "step": 6980 }, { "epoch": 1.66, "learning_rate": 0.00013554460951775642, "loss": 0.7398, "step": 7000 }, { "epoch": 1.66, "eval_loss": 0.7402775883674622, "eval_runtime": 19.5554, "eval_samples_per_second": 102.274, "eval_steps_per_second": 3.222, "step": 7000 }, { "epoch": 1.66, "learning_rate": 0.00013506792722650353, "loss": 0.7311, "step": 7020 }, { "epoch": 1.66, "learning_rate": 0.00013459124493525064, "loss": 0.7319, "step": 7040 }, { "epoch": 1.67, "learning_rate": 0.00013411456264399775, "loss": 0.7315, "step": 7060 }, { "epoch": 1.67, "learning_rate": 0.0001336378803527449, "loss": 0.7329, "step": 7080 }, { "epoch": 1.68, "learning_rate": 0.000133161198061492, "loss": 0.7471, "step": 7100 }, { "epoch": 1.68, "learning_rate": 0.00013268451577023912, "loss": 0.7446, "step": 7120 }, { "epoch": 1.69, "learning_rate": 0.00013220783347898623, "loss": 0.7359, "step": 7140 }, { "epoch": 1.69, "learning_rate": 0.00013173115118773337, "loss": 0.7348, "step": 7160 }, { "epoch": 1.7, "learning_rate": 0.00013125446889648048, "loss": 0.7331, "step": 7180 }, { "epoch": 1.7, "learning_rate": 0.00013077778660522762, "loss": 0.7385, "step": 7200 }, { "epoch": 1.7, "eval_loss": 0.7401012182235718, "eval_runtime": 19.7831, "eval_samples_per_second": 101.096, "eval_steps_per_second": 3.185, "step": 7200 }, { "epoch": 1.71, "learning_rate": 0.00013030110431397473, "loss": 0.744, "step": 7220 }, { "epoch": 1.71, "learning_rate": 0.00012982442202272185, "loss": 0.7327, "step": 7240 }, { "epoch": 1.72, "learning_rate": 0.00012934773973146896, "loss": 0.7384, "step": 7260 }, { "epoch": 1.72, "learning_rate": 0.0001288710574402161, "loss": 0.7399, "step": 7280 }, { "epoch": 1.73, "learning_rate": 0.0001283943751489632, "loss": 0.7376, "step": 7300 }, { "epoch": 1.73, "learning_rate": 0.00012791769285771032, "loss": 0.7416, "step": 7320 }, { "epoch": 1.74, "learning_rate": 0.00012744101056645743, "loss": 0.7299, "step": 7340 }, { "epoch": 1.74, "learning_rate": 0.00012696432827520455, "loss": 0.7389, "step": 7360 }, { "epoch": 1.75, "learning_rate": 0.00012648764598395168, "loss": 0.7295, "step": 7380 }, { "epoch": 1.75, "learning_rate": 0.0001260109636926988, "loss": 0.7389, "step": 7400 }, { "epoch": 1.75, "eval_loss": 0.7385362386703491, "eval_runtime": 19.6728, "eval_samples_per_second": 101.663, "eval_steps_per_second": 3.202, "step": 7400 }, { "epoch": 1.75, "learning_rate": 0.00012553428140144594, "loss": 0.7346, "step": 7420 }, { "epoch": 1.76, "learning_rate": 0.00012505759911019305, "loss": 0.7357, "step": 7440 }, { "epoch": 1.76, "learning_rate": 0.00012458091681894016, "loss": 0.7295, "step": 7460 }, { "epoch": 1.77, "learning_rate": 0.0001241042345276873, "loss": 0.7418, "step": 7480 }, { "epoch": 1.77, "learning_rate": 0.0001236275522364344, "loss": 0.7248, "step": 7500 }, { "epoch": 1.78, "learning_rate": 0.00012315086994518152, "loss": 0.7326, "step": 7520 }, { "epoch": 1.78, "learning_rate": 0.00012267418765392864, "loss": 0.7422, "step": 7540 }, { "epoch": 1.79, "learning_rate": 0.00012219750536267577, "loss": 0.7376, "step": 7560 }, { "epoch": 1.79, "learning_rate": 0.00012172082307142289, "loss": 0.7358, "step": 7580 }, { "epoch": 1.8, "learning_rate": 0.00012124414078017001, "loss": 0.7337, "step": 7600 }, { "epoch": 1.8, "eval_loss": 0.737734854221344, "eval_runtime": 19.8317, "eval_samples_per_second": 100.849, "eval_steps_per_second": 3.177, "step": 7600 }, { "epoch": 1.8, "learning_rate": 0.00012076745848891712, "loss": 0.7318, "step": 7620 }, { "epoch": 1.81, "learning_rate": 0.00012029077619766424, "loss": 0.7356, "step": 7640 }, { "epoch": 1.81, "learning_rate": 0.00011981409390641138, "loss": 0.7355, "step": 7660 }, { "epoch": 1.82, "learning_rate": 0.00011933741161515849, "loss": 0.74, "step": 7680 }, { "epoch": 1.82, "learning_rate": 0.0001188607293239056, "loss": 0.7342, "step": 7700 }, { "epoch": 1.83, "learning_rate": 0.00011838404703265273, "loss": 0.7368, "step": 7720 }, { "epoch": 1.83, "learning_rate": 0.00011790736474139984, "loss": 0.7337, "step": 7740 }, { "epoch": 1.83, "learning_rate": 0.00011743068245014698, "loss": 0.7317, "step": 7760 }, { "epoch": 1.84, "learning_rate": 0.00011695400015889409, "loss": 0.738, "step": 7780 }, { "epoch": 1.84, "learning_rate": 0.0001164773178676412, "loss": 0.7375, "step": 7800 }, { "epoch": 1.84, "eval_loss": 0.7366506457328796, "eval_runtime": 19.9586, "eval_samples_per_second": 100.208, "eval_steps_per_second": 3.157, "step": 7800 }, { "epoch": 1.85, "learning_rate": 0.00011600063557638833, "loss": 0.7349, "step": 7820 }, { "epoch": 1.85, "learning_rate": 0.00011552395328513544, "loss": 0.733, "step": 7840 }, { "epoch": 1.86, "learning_rate": 0.00011504727099388258, "loss": 0.7277, "step": 7860 }, { "epoch": 1.86, "learning_rate": 0.00011457058870262969, "loss": 0.7235, "step": 7880 }, { "epoch": 1.87, "learning_rate": 0.0001140939064113768, "loss": 0.7405, "step": 7900 }, { "epoch": 1.87, "learning_rate": 0.00011361722412012393, "loss": 0.7378, "step": 7920 }, { "epoch": 1.88, "learning_rate": 0.00011314054182887104, "loss": 0.7292, "step": 7940 }, { "epoch": 1.88, "learning_rate": 0.00011266385953761818, "loss": 0.7427, "step": 7960 }, { "epoch": 1.89, "learning_rate": 0.00011218717724636529, "loss": 0.7313, "step": 7980 }, { "epoch": 1.89, "learning_rate": 0.0001117104949551124, "loss": 0.7252, "step": 8000 }, { "epoch": 1.89, "eval_loss": 0.736083984375, "eval_runtime": 19.7958, "eval_samples_per_second": 101.031, "eval_steps_per_second": 3.182, "step": 8000 }, { "epoch": 1.9, "learning_rate": 0.00011123381266385953, "loss": 0.7268, "step": 8020 }, { "epoch": 1.9, "learning_rate": 0.00011075713037260666, "loss": 0.729, "step": 8040 }, { "epoch": 1.91, "learning_rate": 0.00011028044808135377, "loss": 0.7358, "step": 8060 }, { "epoch": 1.91, "learning_rate": 0.00010980376579010089, "loss": 0.7408, "step": 8080 }, { "epoch": 1.92, "learning_rate": 0.000109327083498848, "loss": 0.73, "step": 8100 }, { "epoch": 1.92, "learning_rate": 0.00010887423532215777, "loss": 0.7298, "step": 8120 }, { "epoch": 1.92, "learning_rate": 0.0001083975530309049, "loss": 0.7324, "step": 8140 }, { "epoch": 1.93, "learning_rate": 0.00010792087073965201, "loss": 0.7296, "step": 8160 }, { "epoch": 1.93, "learning_rate": 0.00010744418844839912, "loss": 0.7346, "step": 8180 }, { "epoch": 1.94, "learning_rate": 0.00010696750615714626, "loss": 0.7281, "step": 8200 }, { "epoch": 1.94, "eval_loss": 0.7352190613746643, "eval_runtime": 19.6635, "eval_samples_per_second": 101.711, "eval_steps_per_second": 3.204, "step": 8200 }, { "epoch": 1.94, "learning_rate": 0.00010649082386589337, "loss": 0.7377, "step": 8220 }, { "epoch": 1.95, "learning_rate": 0.0001060141415746405, "loss": 0.7281, "step": 8240 }, { "epoch": 1.95, "learning_rate": 0.00010553745928338761, "loss": 0.7251, "step": 8260 }, { "epoch": 1.96, "learning_rate": 0.00010506077699213472, "loss": 0.7331, "step": 8280 }, { "epoch": 1.96, "learning_rate": 0.00010458409470088186, "loss": 0.7432, "step": 8300 }, { "epoch": 1.97, "learning_rate": 0.00010410741240962897, "loss": 0.7366, "step": 8320 }, { "epoch": 1.97, "learning_rate": 0.0001036307301183761, "loss": 0.7334, "step": 8340 }, { "epoch": 1.98, "learning_rate": 0.00010315404782712321, "loss": 0.7351, "step": 8360 }, { "epoch": 1.98, "learning_rate": 0.00010267736553587032, "loss": 0.7355, "step": 8380 }, { "epoch": 1.99, "learning_rate": 0.00010220068324461746, "loss": 0.7228, "step": 8400 }, { "epoch": 1.99, "eval_loss": 0.7341500520706177, "eval_runtime": 19.6196, "eval_samples_per_second": 101.939, "eval_steps_per_second": 3.211, "step": 8400 }, { "epoch": 1.99, "learning_rate": 0.00010172400095336457, "loss": 0.7451, "step": 8420 }, { "epoch": 2.0, "learning_rate": 0.00010124731866211169, "loss": 0.7356, "step": 8440 }, { "epoch": 2.0, "learning_rate": 0.00010077063637085881, "loss": 0.7255, "step": 8460 }, { "epoch": 2.01, "learning_rate": 0.00010029395407960592, "loss": 0.7267, "step": 8480 }, { "epoch": 2.01, "learning_rate": 9.981727178835306e-05, "loss": 0.7291, "step": 8500 }, { "epoch": 2.01, "learning_rate": 9.934058949710018e-05, "loss": 0.7294, "step": 8520 }, { "epoch": 2.02, "learning_rate": 9.886390720584729e-05, "loss": 0.7377, "step": 8540 }, { "epoch": 2.02, "learning_rate": 9.838722491459441e-05, "loss": 0.7324, "step": 8560 }, { "epoch": 2.03, "learning_rate": 9.791054262334154e-05, "loss": 0.7286, "step": 8580 }, { "epoch": 2.03, "learning_rate": 9.743386033208867e-05, "loss": 0.7286, "step": 8600 }, { "epoch": 2.03, "eval_loss": 0.734474241733551, "eval_runtime": 19.5642, "eval_samples_per_second": 102.228, "eval_steps_per_second": 3.22, "step": 8600 }, { "epoch": 2.04, "learning_rate": 9.695717804083578e-05, "loss": 0.7304, "step": 8620 }, { "epoch": 2.04, "learning_rate": 9.648049574958289e-05, "loss": 0.7348, "step": 8640 }, { "epoch": 2.05, "learning_rate": 9.600381345833002e-05, "loss": 0.7261, "step": 8660 }, { "epoch": 2.05, "learning_rate": 9.552713116707714e-05, "loss": 0.7313, "step": 8680 }, { "epoch": 2.06, "learning_rate": 9.505044887582425e-05, "loss": 0.7379, "step": 8700 }, { "epoch": 2.06, "learning_rate": 9.457376658457138e-05, "loss": 0.7203, "step": 8720 }, { "epoch": 2.07, "learning_rate": 9.409708429331849e-05, "loss": 0.7306, "step": 8740 }, { "epoch": 2.07, "learning_rate": 9.36204020020656e-05, "loss": 0.7332, "step": 8760 }, { "epoch": 2.08, "learning_rate": 9.314371971081274e-05, "loss": 0.7228, "step": 8780 }, { "epoch": 2.08, "learning_rate": 9.266703741955985e-05, "loss": 0.731, "step": 8800 }, { "epoch": 2.08, "eval_loss": 0.7332338690757751, "eval_runtime": 19.7114, "eval_samples_per_second": 101.464, "eval_steps_per_second": 3.196, "step": 8800 }, { "epoch": 2.09, "learning_rate": 9.219035512830698e-05, "loss": 0.7267, "step": 8820 }, { "epoch": 2.09, "learning_rate": 9.171367283705409e-05, "loss": 0.7285, "step": 8840 }, { "epoch": 2.09, "learning_rate": 9.12369905458012e-05, "loss": 0.7214, "step": 8860 }, { "epoch": 2.1, "learning_rate": 9.076030825454834e-05, "loss": 0.7204, "step": 8880 }, { "epoch": 2.1, "learning_rate": 9.028362596329546e-05, "loss": 0.7253, "step": 8900 }, { "epoch": 2.11, "learning_rate": 8.980694367204258e-05, "loss": 0.7253, "step": 8920 }, { "epoch": 2.11, "learning_rate": 8.933026138078969e-05, "loss": 0.7238, "step": 8940 }, { "epoch": 2.12, "learning_rate": 8.88535790895368e-05, "loss": 0.7286, "step": 8960 }, { "epoch": 2.12, "learning_rate": 8.837689679828394e-05, "loss": 0.7385, "step": 8980 }, { "epoch": 2.13, "learning_rate": 8.790021450703106e-05, "loss": 0.7237, "step": 9000 }, { "epoch": 2.13, "eval_loss": 0.7329864501953125, "eval_runtime": 19.7024, "eval_samples_per_second": 101.51, "eval_steps_per_second": 3.198, "step": 9000 }, { "epoch": 2.13, "learning_rate": 8.742353221577817e-05, "loss": 0.7311, "step": 9020 }, { "epoch": 2.14, "learning_rate": 8.69468499245253e-05, "loss": 0.7374, "step": 9040 }, { "epoch": 2.14, "learning_rate": 8.64701676332724e-05, "loss": 0.7194, "step": 9060 }, { "epoch": 2.15, "learning_rate": 8.599348534201955e-05, "loss": 0.7237, "step": 9080 }, { "epoch": 2.15, "learning_rate": 8.551680305076666e-05, "loss": 0.7287, "step": 9100 }, { "epoch": 2.16, "learning_rate": 8.504012075951377e-05, "loss": 0.7385, "step": 9120 }, { "epoch": 2.16, "learning_rate": 8.45634384682609e-05, "loss": 0.7319, "step": 9140 }, { "epoch": 2.17, "learning_rate": 8.408675617700802e-05, "loss": 0.7278, "step": 9160 }, { "epoch": 2.17, "learning_rate": 8.361007388575515e-05, "loss": 0.7293, "step": 9180 }, { "epoch": 2.18, "learning_rate": 8.313339159450226e-05, "loss": 0.7232, "step": 9200 }, { "epoch": 2.18, "eval_loss": 0.7326176762580872, "eval_runtime": 20.1581, "eval_samples_per_second": 99.215, "eval_steps_per_second": 3.125, "step": 9200 }, { "epoch": 2.18, "learning_rate": 8.265670930324937e-05, "loss": 0.7281, "step": 9220 }, { "epoch": 2.18, "learning_rate": 8.21800270119965e-05, "loss": 0.728, "step": 9240 }, { "epoch": 2.19, "learning_rate": 8.170334472074362e-05, "loss": 0.728, "step": 9260 }, { "epoch": 2.19, "learning_rate": 8.122666242949073e-05, "loss": 0.7221, "step": 9280 }, { "epoch": 2.2, "learning_rate": 8.074998013823786e-05, "loss": 0.7242, "step": 9300 }, { "epoch": 2.2, "learning_rate": 8.027329784698497e-05, "loss": 0.7306, "step": 9320 }, { "epoch": 2.21, "learning_rate": 7.979661555573208e-05, "loss": 0.7218, "step": 9340 }, { "epoch": 2.21, "learning_rate": 7.931993326447922e-05, "loss": 0.7289, "step": 9360 }, { "epoch": 2.22, "learning_rate": 7.884325097322634e-05, "loss": 0.7177, "step": 9380 }, { "epoch": 2.22, "learning_rate": 7.836656868197346e-05, "loss": 0.7265, "step": 9400 }, { "epoch": 2.22, "eval_loss": 0.7311453819274902, "eval_runtime": 19.9076, "eval_samples_per_second": 100.464, "eval_steps_per_second": 3.165, "step": 9400 }, { "epoch": 2.23, "learning_rate": 7.788988639072057e-05, "loss": 0.7269, "step": 9420 }, { "epoch": 2.23, "learning_rate": 7.741320409946769e-05, "loss": 0.7275, "step": 9440 }, { "epoch": 2.24, "learning_rate": 7.693652180821483e-05, "loss": 0.7317, "step": 9460 }, { "epoch": 2.24, "learning_rate": 7.645983951696194e-05, "loss": 0.7344, "step": 9480 }, { "epoch": 2.25, "learning_rate": 7.598315722570906e-05, "loss": 0.7263, "step": 9500 }, { "epoch": 2.25, "learning_rate": 7.550647493445617e-05, "loss": 0.7299, "step": 9520 }, { "epoch": 2.26, "learning_rate": 7.502979264320329e-05, "loss": 0.724, "step": 9540 }, { "epoch": 2.26, "learning_rate": 7.455311035195041e-05, "loss": 0.7266, "step": 9560 }, { "epoch": 2.27, "learning_rate": 7.407642806069754e-05, "loss": 0.7299, "step": 9580 }, { "epoch": 2.27, "learning_rate": 7.359974576944465e-05, "loss": 0.7236, "step": 9600 }, { "epoch": 2.27, "eval_loss": 0.7311366200447083, "eval_runtime": 20.0053, "eval_samples_per_second": 99.973, "eval_steps_per_second": 3.149, "step": 9600 }, { "epoch": 2.27, "learning_rate": 7.314689759275442e-05, "loss": 0.7252, "step": 9620 }, { "epoch": 2.28, "learning_rate": 7.267021530150154e-05, "loss": 0.7252, "step": 9640 }, { "epoch": 2.28, "learning_rate": 7.219353301024865e-05, "loss": 0.7188, "step": 9660 }, { "epoch": 2.29, "learning_rate": 7.171685071899578e-05, "loss": 0.7243, "step": 9680 }, { "epoch": 2.29, "learning_rate": 7.12401684277429e-05, "loss": 0.7298, "step": 9700 }, { "epoch": 2.3, "learning_rate": 7.076348613649002e-05, "loss": 0.7325, "step": 9720 }, { "epoch": 2.3, "learning_rate": 7.028680384523714e-05, "loss": 0.7286, "step": 9740 }, { "epoch": 2.31, "learning_rate": 6.981012155398426e-05, "loss": 0.7201, "step": 9760 }, { "epoch": 2.31, "learning_rate": 6.933343926273138e-05, "loss": 0.7184, "step": 9780 }, { "epoch": 2.32, "learning_rate": 6.885675697147851e-05, "loss": 0.7291, "step": 9800 }, { "epoch": 2.32, "eval_loss": 0.7308618426322937, "eval_runtime": 19.7965, "eval_samples_per_second": 101.028, "eval_steps_per_second": 3.182, "step": 9800 }, { "epoch": 2.32, "learning_rate": 6.838007468022563e-05, "loss": 0.7318, "step": 9820 }, { "epoch": 2.33, "learning_rate": 6.790339238897274e-05, "loss": 0.7227, "step": 9840 }, { "epoch": 2.33, "learning_rate": 6.742671009771986e-05, "loss": 0.7377, "step": 9860 }, { "epoch": 2.34, "learning_rate": 6.695002780646698e-05, "loss": 0.7367, "step": 9880 }, { "epoch": 2.34, "learning_rate": 6.647334551521411e-05, "loss": 0.7218, "step": 9900 }, { "epoch": 2.35, "learning_rate": 6.599666322396122e-05, "loss": 0.7282, "step": 9920 }, { "epoch": 2.35, "learning_rate": 6.551998093270835e-05, "loss": 0.7231, "step": 9940 }, { "epoch": 2.36, "learning_rate": 6.504329864145546e-05, "loss": 0.7257, "step": 9960 }, { "epoch": 2.36, "learning_rate": 6.456661635020258e-05, "loss": 0.7275, "step": 9980 }, { "epoch": 2.36, "learning_rate": 6.40899340589497e-05, "loss": 0.725, "step": 10000 }, { "epoch": 2.36, "eval_loss": 0.7301817536354065, "eval_runtime": 19.7914, "eval_samples_per_second": 101.054, "eval_steps_per_second": 3.183, "step": 10000 }, { "epoch": 2.37, "learning_rate": 6.361325176769682e-05, "loss": 0.72, "step": 10020 }, { "epoch": 2.37, "learning_rate": 6.313656947644395e-05, "loss": 0.7267, "step": 10040 }, { "epoch": 2.38, "learning_rate": 6.265988718519107e-05, "loss": 0.7276, "step": 10060 }, { "epoch": 2.38, "learning_rate": 6.218320489393818e-05, "loss": 0.7262, "step": 10080 }, { "epoch": 2.39, "learning_rate": 6.17065226026853e-05, "loss": 0.7149, "step": 10100 }, { "epoch": 2.39, "learning_rate": 6.122984031143242e-05, "loss": 0.7305, "step": 10120 }, { "epoch": 2.4, "learning_rate": 6.075315802017954e-05, "loss": 0.7314, "step": 10140 }, { "epoch": 2.4, "learning_rate": 6.027647572892667e-05, "loss": 0.7154, "step": 10160 }, { "epoch": 2.41, "learning_rate": 5.9799793437673786e-05, "loss": 0.7263, "step": 10180 }, { "epoch": 2.41, "learning_rate": 5.93231111464209e-05, "loss": 0.7203, "step": 10200 }, { "epoch": 2.41, "eval_loss": 0.7294782996177673, "eval_runtime": 19.7824, "eval_samples_per_second": 101.1, "eval_steps_per_second": 3.185, "step": 10200 }, { "epoch": 2.42, "learning_rate": 5.8846428855168024e-05, "loss": 0.7208, "step": 10220 }, { "epoch": 2.42, "learning_rate": 5.836974656391514e-05, "loss": 0.7266, "step": 10240 }, { "epoch": 2.43, "learning_rate": 5.789306427266227e-05, "loss": 0.7285, "step": 10260 }, { "epoch": 2.43, "learning_rate": 5.741638198140939e-05, "loss": 0.7215, "step": 10280 }, { "epoch": 2.44, "learning_rate": 5.6939699690156506e-05, "loss": 0.7203, "step": 10300 }, { "epoch": 2.44, "learning_rate": 5.6463017398903625e-05, "loss": 0.7314, "step": 10320 }, { "epoch": 2.44, "learning_rate": 5.5986335107650744e-05, "loss": 0.7394, "step": 10340 }, { "epoch": 2.45, "learning_rate": 5.550965281639787e-05, "loss": 0.7138, "step": 10360 }, { "epoch": 2.45, "learning_rate": 5.503297052514498e-05, "loss": 0.721, "step": 10380 }, { "epoch": 2.46, "learning_rate": 5.455628823389211e-05, "loss": 0.7199, "step": 10400 }, { "epoch": 2.46, "eval_loss": 0.728507936000824, "eval_runtime": 19.7761, "eval_samples_per_second": 101.132, "eval_steps_per_second": 3.186, "step": 10400 }, { "epoch": 2.46, "learning_rate": 5.4079605942639226e-05, "loss": 0.7228, "step": 10420 }, { "epoch": 2.47, "learning_rate": 5.3602923651386345e-05, "loss": 0.7193, "step": 10440 }, { "epoch": 2.47, "learning_rate": 5.3126241360133464e-05, "loss": 0.7269, "step": 10460 }, { "epoch": 2.48, "learning_rate": 5.264955906888058e-05, "loss": 0.729, "step": 10480 }, { "epoch": 2.48, "learning_rate": 5.217287677762771e-05, "loss": 0.7193, "step": 10500 }, { "epoch": 2.49, "learning_rate": 5.169619448637483e-05, "loss": 0.7158, "step": 10520 }, { "epoch": 2.49, "learning_rate": 5.121951219512195e-05, "loss": 0.7158, "step": 10540 }, { "epoch": 2.5, "learning_rate": 5.0742829903869065e-05, "loss": 0.7177, "step": 10560 }, { "epoch": 2.5, "learning_rate": 5.0266147612616184e-05, "loss": 0.7187, "step": 10580 }, { "epoch": 2.51, "learning_rate": 4.978946532136331e-05, "loss": 0.7185, "step": 10600 }, { "epoch": 2.51, "eval_loss": 0.7283052802085876, "eval_runtime": 20.2682, "eval_samples_per_second": 98.677, "eval_steps_per_second": 3.108, "step": 10600 }, { "epoch": 2.51, "learning_rate": 4.931278303011042e-05, "loss": 0.7264, "step": 10620 }, { "epoch": 2.52, "learning_rate": 4.883610073885755e-05, "loss": 0.7208, "step": 10640 }, { "epoch": 2.52, "learning_rate": 4.835941844760467e-05, "loss": 0.7275, "step": 10660 }, { "epoch": 2.53, "learning_rate": 4.7882736156351786e-05, "loss": 0.7205, "step": 10680 }, { "epoch": 2.53, "learning_rate": 4.740605386509891e-05, "loss": 0.7213, "step": 10700 }, { "epoch": 2.53, "learning_rate": 4.692937157384602e-05, "loss": 0.7324, "step": 10720 }, { "epoch": 2.54, "learning_rate": 4.645268928259315e-05, "loss": 0.7197, "step": 10740 }, { "epoch": 2.54, "learning_rate": 4.597600699134027e-05, "loss": 0.7162, "step": 10760 }, { "epoch": 2.55, "learning_rate": 4.5499324700087394e-05, "loss": 0.7223, "step": 10780 }, { "epoch": 2.55, "learning_rate": 4.5022642408834506e-05, "loss": 0.7249, "step": 10800 }, { "epoch": 2.55, "eval_loss": 0.7278863191604614, "eval_runtime": 19.7684, "eval_samples_per_second": 101.171, "eval_steps_per_second": 3.187, "step": 10800 }, { "epoch": 2.56, "learning_rate": 4.4545960117581625e-05, "loss": 0.7245, "step": 10820 }, { "epoch": 2.56, "learning_rate": 4.406927782632875e-05, "loss": 0.7298, "step": 10840 }, { "epoch": 2.57, "learning_rate": 4.359259553507587e-05, "loss": 0.7172, "step": 10860 }, { "epoch": 2.57, "learning_rate": 4.3115913243822995e-05, "loss": 0.7183, "step": 10880 }, { "epoch": 2.58, "learning_rate": 4.263923095257011e-05, "loss": 0.7172, "step": 10900 }, { "epoch": 2.58, "learning_rate": 4.2162548661317226e-05, "loss": 0.7166, "step": 10920 }, { "epoch": 2.59, "learning_rate": 4.168586637006435e-05, "loss": 0.7303, "step": 10940 }, { "epoch": 2.59, "learning_rate": 4.1209184078811464e-05, "loss": 0.716, "step": 10960 }, { "epoch": 2.6, "learning_rate": 4.073250178755859e-05, "loss": 0.7199, "step": 10980 }, { "epoch": 2.6, "learning_rate": 4.025581949630571e-05, "loss": 0.7227, "step": 11000 }, { "epoch": 2.6, "eval_loss": 0.7274474501609802, "eval_runtime": 19.9546, "eval_samples_per_second": 100.228, "eval_steps_per_second": 3.157, "step": 11000 }, { "epoch": 2.61, "learning_rate": 3.9779137205052834e-05, "loss": 0.7134, "step": 11020 }, { "epoch": 2.61, "learning_rate": 3.930245491379995e-05, "loss": 0.7354, "step": 11040 }, { "epoch": 2.62, "learning_rate": 3.8825772622547065e-05, "loss": 0.7269, "step": 11060 }, { "epoch": 2.62, "learning_rate": 3.834909033129419e-05, "loss": 0.7261, "step": 11080 }, { "epoch": 2.62, "learning_rate": 3.787240804004131e-05, "loss": 0.735, "step": 11100 }, { "epoch": 2.63, "learning_rate": 3.739572574878843e-05, "loss": 0.716, "step": 11120 }, { "epoch": 2.63, "learning_rate": 3.691904345753555e-05, "loss": 0.721, "step": 11140 }, { "epoch": 2.64, "learning_rate": 3.644236116628267e-05, "loss": 0.7201, "step": 11160 }, { "epoch": 2.64, "learning_rate": 3.596567887502979e-05, "loss": 0.7231, "step": 11180 }, { "epoch": 2.65, "learning_rate": 3.548899658377691e-05, "loss": 0.7172, "step": 11200 }, { "epoch": 2.65, "eval_loss": 0.7270590662956238, "eval_runtime": 19.753, "eval_samples_per_second": 101.251, "eval_steps_per_second": 3.189, "step": 11200 }, { "epoch": 2.65, "learning_rate": 3.501231429252403e-05, "loss": 0.7296, "step": 11220 }, { "epoch": 2.66, "learning_rate": 3.453563200127115e-05, "loss": 0.7239, "step": 11240 }, { "epoch": 2.66, "learning_rate": 3.405894971001827e-05, "loss": 0.7215, "step": 11260 }, { "epoch": 2.67, "learning_rate": 3.358226741876539e-05, "loss": 0.7176, "step": 11280 }, { "epoch": 2.67, "learning_rate": 3.310558512751251e-05, "loss": 0.7277, "step": 11300 }, { "epoch": 2.68, "learning_rate": 3.262890283625963e-05, "loss": 0.7237, "step": 11320 }, { "epoch": 2.68, "learning_rate": 3.215222054500675e-05, "loss": 0.7167, "step": 11340 }, { "epoch": 2.69, "learning_rate": 3.167553825375387e-05, "loss": 0.7184, "step": 11360 }, { "epoch": 2.69, "learning_rate": 3.119885596250099e-05, "loss": 0.7238, "step": 11380 }, { "epoch": 2.7, "learning_rate": 3.072217367124811e-05, "loss": 0.7188, "step": 11400 }, { "epoch": 2.7, "eval_loss": 0.7263159155845642, "eval_runtime": 19.6317, "eval_samples_per_second": 101.876, "eval_steps_per_second": 3.209, "step": 11400 }, { "epoch": 2.7, "learning_rate": 3.0245491379995232e-05, "loss": 0.7146, "step": 11420 }, { "epoch": 2.71, "learning_rate": 2.9768809088742348e-05, "loss": 0.7307, "step": 11440 }, { "epoch": 2.71, "learning_rate": 2.929212679748947e-05, "loss": 0.721, "step": 11460 }, { "epoch": 2.71, "learning_rate": 2.881544450623659e-05, "loss": 0.7293, "step": 11480 }, { "epoch": 2.72, "learning_rate": 2.833876221498371e-05, "loss": 0.7245, "step": 11500 }, { "epoch": 2.72, "learning_rate": 2.7862079923730833e-05, "loss": 0.7264, "step": 11520 }, { "epoch": 2.73, "learning_rate": 2.7385397632477952e-05, "loss": 0.722, "step": 11540 }, { "epoch": 2.73, "learning_rate": 2.6908715341225068e-05, "loss": 0.7195, "step": 11560 }, { "epoch": 2.74, "learning_rate": 2.643203304997219e-05, "loss": 0.7181, "step": 11580 }, { "epoch": 2.74, "learning_rate": 2.5955350758719312e-05, "loss": 0.7225, "step": 11600 }, { "epoch": 2.74, "eval_loss": 0.7265506386756897, "eval_runtime": 19.5252, "eval_samples_per_second": 102.432, "eval_steps_per_second": 3.227, "step": 11600 }, { "epoch": 2.75, "learning_rate": 2.547866846746643e-05, "loss": 0.7151, "step": 11620 }, { "epoch": 2.75, "learning_rate": 2.5001986176213553e-05, "loss": 0.7211, "step": 11640 }, { "epoch": 2.76, "learning_rate": 2.4525303884960672e-05, "loss": 0.7231, "step": 11660 }, { "epoch": 2.76, "learning_rate": 2.404862159370779e-05, "loss": 0.7236, "step": 11680 }, { "epoch": 2.77, "learning_rate": 2.357193930245491e-05, "loss": 0.7161, "step": 11700 }, { "epoch": 2.77, "learning_rate": 2.3095257011202032e-05, "loss": 0.7248, "step": 11720 }, { "epoch": 2.78, "learning_rate": 2.261857471994915e-05, "loss": 0.7195, "step": 11740 }, { "epoch": 2.78, "learning_rate": 2.2141892428696274e-05, "loss": 0.718, "step": 11760 }, { "epoch": 2.79, "learning_rate": 2.1665210137443392e-05, "loss": 0.7161, "step": 11780 }, { "epoch": 2.79, "learning_rate": 2.118852784619051e-05, "loss": 0.7204, "step": 11800 }, { "epoch": 2.79, "eval_loss": 0.7261104583740234, "eval_runtime": 20.0617, "eval_samples_per_second": 99.692, "eval_steps_per_second": 3.14, "step": 11800 }, { "epoch": 2.79, "learning_rate": 2.071184555493763e-05, "loss": 0.716, "step": 11820 }, { "epoch": 2.8, "learning_rate": 2.0235163263684753e-05, "loss": 0.7211, "step": 11840 }, { "epoch": 2.8, "learning_rate": 1.975848097243187e-05, "loss": 0.7242, "step": 11860 }, { "epoch": 2.81, "learning_rate": 1.9281798681178994e-05, "loss": 0.7129, "step": 11880 }, { "epoch": 2.81, "learning_rate": 1.8828950504488756e-05, "loss": 0.7233, "step": 11900 }, { "epoch": 2.82, "learning_rate": 1.8352268213235875e-05, "loss": 0.7286, "step": 11920 }, { "epoch": 2.82, "learning_rate": 1.7875585921982997e-05, "loss": 0.7147, "step": 11940 }, { "epoch": 2.83, "learning_rate": 1.7398903630730116e-05, "loss": 0.7303, "step": 11960 }, { "epoch": 2.83, "learning_rate": 1.692222133947724e-05, "loss": 0.7126, "step": 11980 }, { "epoch": 2.84, "learning_rate": 1.6445539048224358e-05, "loss": 0.7174, "step": 12000 }, { "epoch": 2.84, "eval_loss": 0.7259587645530701, "eval_runtime": 20.6636, "eval_samples_per_second": 96.788, "eval_steps_per_second": 3.049, "step": 12000 }, { "epoch": 2.84, "learning_rate": 1.5968856756971476e-05, "loss": 0.7147, "step": 12020 }, { "epoch": 2.85, "learning_rate": 1.54921744657186e-05, "loss": 0.7184, "step": 12040 }, { "epoch": 2.85, "learning_rate": 1.5015492174465718e-05, "loss": 0.7218, "step": 12060 }, { "epoch": 2.86, "learning_rate": 1.4538809883212837e-05, "loss": 0.7172, "step": 12080 }, { "epoch": 2.86, "learning_rate": 1.4062127591959957e-05, "loss": 0.7326, "step": 12100 }, { "epoch": 2.87, "learning_rate": 1.3585445300707078e-05, "loss": 0.726, "step": 12120 }, { "epoch": 2.87, "learning_rate": 1.3108763009454197e-05, "loss": 0.711, "step": 12140 }, { "epoch": 2.88, "learning_rate": 1.2632080718201317e-05, "loss": 0.7199, "step": 12160 }, { "epoch": 2.88, "learning_rate": 1.215539842694844e-05, "loss": 0.7256, "step": 12180 }, { "epoch": 2.88, "learning_rate": 1.1678716135695557e-05, "loss": 0.7183, "step": 12200 }, { "epoch": 2.88, "eval_loss": 0.7255927324295044, "eval_runtime": 20.0566, "eval_samples_per_second": 99.718, "eval_steps_per_second": 3.141, "step": 12200 }, { "epoch": 2.89, "learning_rate": 1.1202033844442679e-05, "loss": 0.7244, "step": 12220 }, { "epoch": 2.89, "learning_rate": 1.07253515531898e-05, "loss": 0.717, "step": 12240 }, { "epoch": 2.9, "learning_rate": 1.0248669261936918e-05, "loss": 0.7224, "step": 12260 }, { "epoch": 2.9, "learning_rate": 9.771986970684039e-06, "loss": 0.7124, "step": 12280 }, { "epoch": 2.91, "learning_rate": 9.295304679431158e-06, "loss": 0.7285, "step": 12300 }, { "epoch": 2.91, "learning_rate": 8.818622388178278e-06, "loss": 0.7337, "step": 12320 }, { "epoch": 2.92, "learning_rate": 8.341940096925399e-06, "loss": 0.716, "step": 12340 }, { "epoch": 2.92, "learning_rate": 7.865257805672518e-06, "loss": 0.7212, "step": 12360 }, { "epoch": 2.93, "learning_rate": 7.3885755144196385e-06, "loss": 0.7262, "step": 12380 }, { "epoch": 2.93, "learning_rate": 6.911893223166759e-06, "loss": 0.7151, "step": 12400 }, { "epoch": 2.93, "eval_loss": 0.7256051301956177, "eval_runtime": 19.7012, "eval_samples_per_second": 101.516, "eval_steps_per_second": 3.198, "step": 12400 } ], "max_steps": 12687, "num_train_epochs": 3, "total_flos": 1.6118284402370281e+19, "trial_name": null, "trial_params": null }