{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 3710, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 0.00029999626535870253, "loss": 1.7113, "step": 10 }, { "epoch": 0.03, "learning_rate": 0.0002999850616207776, "loss": 1.6682, "step": 20 }, { "epoch": 0.04, "learning_rate": 0.00029996638934411774, "loss": 1.6829, "step": 30 }, { "epoch": 0.05, "learning_rate": 0.00029994024945851293, "loss": 1.4132, "step": 40 }, { "epoch": 0.07, "learning_rate": 0.00029990664326560453, "loss": 1.4066, "step": 50 }, { "epoch": 0.08, "learning_rate": 0.0002998655724388202, "loss": 1.5205, "step": 60 }, { "epoch": 0.09, "learning_rate": 0.0002998170390232907, "loss": 1.5003, "step": 70 }, { "epoch": 0.11, "learning_rate": 0.000299761045435748, "loss": 1.4407, "step": 80 }, { "epoch": 0.12, "learning_rate": 0.0002996975944644049, "loss": 1.4323, "step": 90 }, { "epoch": 0.13, "learning_rate": 0.0002996266892688164, "loss": 1.4167, "step": 100 }, { "epoch": 0.15, "learning_rate": 0.00029954833337972206, "loss": 1.4195, "step": 110 }, { "epoch": 0.16, "learning_rate": 0.0002994625306988704, "loss": 1.4177, "step": 120 }, { "epoch": 0.18, "learning_rate": 0.0002993692854988246, "loss": 1.4279, "step": 130 }, { "epoch": 0.19, "learning_rate": 0.0002992686024227496, "loss": 1.3698, "step": 140 }, { "epoch": 0.2, "learning_rate": 0.0002991604864841811, "loss": 1.4032, "step": 150 }, { "epoch": 0.22, "learning_rate": 0.00029904494306677576, "loss": 1.2865, "step": 160 }, { "epoch": 0.23, "learning_rate": 0.00029892197792404313, "loss": 1.2427, "step": 170 }, { "epoch": 0.24, "learning_rate": 0.0002987915971790592, "loss": 1.2646, "step": 180 }, { "epoch": 0.26, "learning_rate": 0.00029865380732416153, "loss": 1.2246, "step": 190 }, { "epoch": 0.27, "learning_rate": 0.00029850861522062586, "loss": 1.2994, "step": 200 }, { "epoch": 0.28, "learning_rate": 0.00029835602809832456, "loss": 1.4174, "step": 210 }, { "epoch": 0.3, "learning_rate": 0.00029819605355536655, "loss": 1.1977, "step": 220 }, { "epoch": 0.31, "learning_rate": 0.0002980286995577189, "loss": 1.2475, "step": 230 }, { "epoch": 0.32, "learning_rate": 0.0002978539744388104, "loss": 1.3046, "step": 240 }, { "epoch": 0.34, "learning_rate": 0.00029767188689911616, "loss": 1.231, "step": 250 }, { "epoch": 0.35, "learning_rate": 0.00029748244600572493, "loss": 1.1623, "step": 260 }, { "epoch": 0.36, "learning_rate": 0.0002972856611918871, "loss": 1.22, "step": 270 }, { "epoch": 0.38, "learning_rate": 0.00029708154225654526, "loss": 1.3153, "step": 280 }, { "epoch": 0.39, "learning_rate": 0.00029687009936384606, "loss": 1.0245, "step": 290 }, { "epoch": 0.4, "learning_rate": 0.0002966513430426344, "loss": 1.1617, "step": 300 }, { "epoch": 0.42, "learning_rate": 0.0002964252841859287, "loss": 1.3038, "step": 310 }, { "epoch": 0.43, "learning_rate": 0.00029619193405037905, "loss": 1.2176, "step": 320 }, { "epoch": 0.44, "learning_rate": 0.000295951304255706, "loss": 1.0473, "step": 330 }, { "epoch": 0.46, "learning_rate": 0.0002957034067841225, "loss": 1.0024, "step": 340 }, { "epoch": 0.47, "learning_rate": 0.00029544825397973706, "loss": 1.0392, "step": 350 }, { "epoch": 0.49, "learning_rate": 0.00029518585854793896, "loss": 1.1253, "step": 360 }, { "epoch": 0.5, "learning_rate": 0.00029491623355476574, "loss": 1.2854, "step": 370 }, { "epoch": 0.51, "learning_rate": 0.0002946393924262526, "loss": 1.2807, "step": 380 }, { "epoch": 0.53, "learning_rate": 0.0002943553489477636, "loss": 1.1513, "step": 390 }, { "epoch": 0.54, "learning_rate": 0.00029406411726330553, "loss": 0.971, "step": 400 }, { "epoch": 0.55, "learning_rate": 0.0002937657118748234, "loss": 0.97, "step": 410 }, { "epoch": 0.57, "learning_rate": 0.00029346014764147836, "loss": 1.0773, "step": 420 }, { "epoch": 0.58, "learning_rate": 0.0002931474397789078, "loss": 1.1714, "step": 430 }, { "epoch": 0.59, "learning_rate": 0.0002928276038584677, "loss": 1.1828, "step": 440 }, { "epoch": 0.61, "learning_rate": 0.0002925006558064572, "loss": 1.0298, "step": 450 }, { "epoch": 0.62, "learning_rate": 0.0002921666119033256, "loss": 1.0366, "step": 460 }, { "epoch": 0.63, "learning_rate": 0.0002918254887828617, "loss": 0.9101, "step": 470 }, { "epoch": 0.65, "learning_rate": 0.0002914773034313653, "loss": 0.9801, "step": 480 }, { "epoch": 0.66, "learning_rate": 0.0002911220731868018, "loss": 0.9764, "step": 490 }, { "epoch": 0.67, "learning_rate": 0.00029075981573793827, "loss": 0.8117, "step": 500 }, { "epoch": 0.69, "learning_rate": 0.00029039054912346313, "loss": 0.9689, "step": 510 }, { "epoch": 0.7, "learning_rate": 0.0002900142917310877, "loss": 1.068, "step": 520 }, { "epoch": 0.71, "learning_rate": 0.00028963106229663063, "loss": 0.9002, "step": 530 }, { "epoch": 0.73, "learning_rate": 0.0002892408799030848, "loss": 0.7903, "step": 540 }, { "epoch": 0.74, "learning_rate": 0.00028884376397966734, "loss": 0.8156, "step": 550 }, { "epoch": 0.75, "learning_rate": 0.00028843973430085204, "loss": 1.0247, "step": 560 }, { "epoch": 0.77, "learning_rate": 0.00028802881098538433, "loss": 0.8413, "step": 570 }, { "epoch": 0.78, "learning_rate": 0.0002876110144952802, "loss": 0.9629, "step": 580 }, { "epoch": 0.8, "learning_rate": 0.00028718636563480654, "loss": 0.9488, "step": 590 }, { "epoch": 0.81, "learning_rate": 0.0002867548855494457, "loss": 0.74, "step": 600 }, { "epoch": 0.82, "learning_rate": 0.00028631659572484223, "loss": 0.8782, "step": 610 }, { "epoch": 0.84, "learning_rate": 0.0002858715179857333, "loss": 0.7538, "step": 620 }, { "epoch": 0.85, "learning_rate": 0.0002854196744948615, "loss": 1.0124, "step": 630 }, { "epoch": 0.86, "learning_rate": 0.00028496108775187177, "loss": 1.0094, "step": 640 }, { "epoch": 0.88, "learning_rate": 0.00028449578059219045, "loss": 0.9834, "step": 650 }, { "epoch": 0.89, "learning_rate": 0.0002840237761858889, "loss": 0.8183, "step": 660 }, { "epoch": 0.9, "learning_rate": 0.00028354509803652894, "loss": 0.7833, "step": 670 }, { "epoch": 0.92, "learning_rate": 0.00028305976997999307, "loss": 1.0735, "step": 680 }, { "epoch": 0.93, "learning_rate": 0.00028256781618329733, "loss": 0.6456, "step": 690 }, { "epoch": 0.94, "learning_rate": 0.0002820692611433879, "loss": 0.8017, "step": 700 }, { "epoch": 0.96, "learning_rate": 0.00028156412968592144, "loss": 0.639, "step": 710 }, { "epoch": 0.97, "learning_rate": 0.0002810524469640285, "loss": 0.9926, "step": 720 }, { "epoch": 0.98, "learning_rate": 0.0002805342384570614, "loss": 0.6367, "step": 730 }, { "epoch": 1.0, "learning_rate": 0.00028000952996932554, "loss": 0.7394, "step": 740 }, { "epoch": 1.01, "learning_rate": 0.0002794783476287939, "loss": 0.5386, "step": 750 }, { "epoch": 1.02, "learning_rate": 0.0002789407178858066, "loss": 0.7581, "step": 760 }, { "epoch": 1.04, "learning_rate": 0.00027839666751175354, "loss": 0.7513, "step": 770 }, { "epoch": 1.05, "learning_rate": 0.0002778462235977413, "loss": 0.9431, "step": 780 }, { "epoch": 1.06, "learning_rate": 0.0002772894135532442, "loss": 0.9494, "step": 790 }, { "epoch": 1.08, "learning_rate": 0.00027672626510473936, "loss": 0.6138, "step": 800 }, { "epoch": 1.09, "learning_rate": 0.0002761568062943261, "loss": 0.8516, "step": 810 }, { "epoch": 1.11, "learning_rate": 0.00027558106547832985, "loss": 0.7844, "step": 820 }, { "epoch": 1.12, "learning_rate": 0.0002749990713258895, "loss": 0.6772, "step": 830 }, { "epoch": 1.13, "learning_rate": 0.00027441085281753024, "loss": 0.6081, "step": 840 }, { "epoch": 1.15, "learning_rate": 0.0002738164392437207, "loss": 0.7722, "step": 850 }, { "epoch": 1.16, "learning_rate": 0.0002732158602034138, "loss": 0.6299, "step": 860 }, { "epoch": 1.17, "learning_rate": 0.00027260914560257345, "loss": 0.6504, "step": 870 }, { "epoch": 1.19, "learning_rate": 0.00027199632565268504, "loss": 0.637, "step": 880 }, { "epoch": 1.2, "learning_rate": 0.0002713774308692511, "loss": 0.7156, "step": 890 }, { "epoch": 1.21, "learning_rate": 0.00027075249207027187, "loss": 0.797, "step": 900 }, { "epoch": 1.23, "learning_rate": 0.00027012154037471065, "loss": 0.8322, "step": 910 }, { "epoch": 1.24, "learning_rate": 0.00026948460720094416, "loss": 0.7509, "step": 920 }, { "epoch": 1.25, "learning_rate": 0.0002688417242651983, "loss": 0.716, "step": 930 }, { "epoch": 1.27, "learning_rate": 0.00026819292357996847, "loss": 0.6985, "step": 940 }, { "epoch": 1.28, "learning_rate": 0.00026753823745242583, "loss": 0.7565, "step": 950 }, { "epoch": 1.29, "learning_rate": 0.0002668776984828083, "loss": 0.9529, "step": 960 }, { "epoch": 1.31, "learning_rate": 0.0002662113395627975, "loss": 0.7075, "step": 970 }, { "epoch": 1.32, "learning_rate": 0.0002655391938738806, "loss": 0.7943, "step": 980 }, { "epoch": 1.33, "learning_rate": 0.00026486129488569824, "loss": 0.8068, "step": 990 }, { "epoch": 1.35, "learning_rate": 0.0002641776763543778, "loss": 0.7974, "step": 1000 }, { "epoch": 1.36, "learning_rate": 0.0002634883723208527, "loss": 0.667, "step": 1010 }, { "epoch": 1.37, "learning_rate": 0.0002627934171091669, "loss": 0.8704, "step": 1020 }, { "epoch": 1.39, "learning_rate": 0.00026209284532476636, "loss": 0.5226, "step": 1030 }, { "epoch": 1.4, "learning_rate": 0.0002613866918527752, "loss": 0.6797, "step": 1040 }, { "epoch": 1.42, "learning_rate": 0.0002606749918562591, "loss": 0.6037, "step": 1050 }, { "epoch": 1.43, "learning_rate": 0.0002599577807744739, "loss": 0.539, "step": 1060 }, { "epoch": 1.44, "learning_rate": 0.0002592350943211014, "loss": 0.628, "step": 1070 }, { "epoch": 1.46, "learning_rate": 0.0002585069684824706, "loss": 0.7272, "step": 1080 }, { "epoch": 1.47, "learning_rate": 0.0002577734395157657, "loss": 0.6978, "step": 1090 }, { "epoch": 1.48, "learning_rate": 0.00025703454394722115, "loss": 0.5347, "step": 1100 }, { "epoch": 1.5, "learning_rate": 0.00025629031857030225, "loss": 0.6832, "step": 1110 }, { "epoch": 1.51, "learning_rate": 0.0002555408004438734, "loss": 0.4517, "step": 1120 }, { "epoch": 1.52, "learning_rate": 0.00025478602689035253, "loss": 0.4694, "step": 1130 }, { "epoch": 1.54, "learning_rate": 0.00025402603549385284, "loss": 0.5531, "step": 1140 }, { "epoch": 1.55, "learning_rate": 0.000253260864098311, "loss": 0.965, "step": 1150 }, { "epoch": 1.56, "learning_rate": 0.00025249055080560297, "loss": 0.5112, "step": 1160 }, { "epoch": 1.58, "learning_rate": 0.0002517151339736464, "loss": 0.6728, "step": 1170 }, { "epoch": 1.59, "learning_rate": 0.00025093465221449115, "loss": 0.7728, "step": 1180 }, { "epoch": 1.6, "learning_rate": 0.0002501491443923959, "loss": 0.5934, "step": 1190 }, { "epoch": 1.62, "learning_rate": 0.0002493586496218933, "loss": 0.6981, "step": 1200 }, { "epoch": 1.63, "learning_rate": 0.0002485632072658423, "loss": 0.5475, "step": 1210 }, { "epoch": 1.64, "learning_rate": 0.0002477628569334679, "loss": 0.691, "step": 1220 }, { "epoch": 1.66, "learning_rate": 0.00024695763847838866, "loss": 0.7188, "step": 1230 }, { "epoch": 1.67, "learning_rate": 0.00024614759199663265, "loss": 0.6256, "step": 1240 }, { "epoch": 1.68, "learning_rate": 0.0002453327578246404, "loss": 0.4491, "step": 1250 }, { "epoch": 1.7, "learning_rate": 0.0002445131765372567, "loss": 0.7007, "step": 1260 }, { "epoch": 1.71, "learning_rate": 0.00024368888894570962, "loss": 0.6256, "step": 1270 }, { "epoch": 1.73, "learning_rate": 0.000242859936095579, "loss": 0.7769, "step": 1280 }, { "epoch": 1.74, "learning_rate": 0.00024202635926475223, "loss": 0.5488, "step": 1290 }, { "epoch": 1.75, "learning_rate": 0.00024118819996136865, "loss": 0.4278, "step": 1300 }, { "epoch": 1.77, "learning_rate": 0.00024034549992175288, "loss": 0.4966, "step": 1310 }, { "epoch": 1.78, "learning_rate": 0.0002394983011083366, "loss": 0.4736, "step": 1320 }, { "epoch": 1.79, "learning_rate": 0.00023864664570756873, "loss": 0.6513, "step": 1330 }, { "epoch": 1.81, "learning_rate": 0.00023779057612781506, "loss": 0.5389, "step": 1340 }, { "epoch": 1.82, "learning_rate": 0.00023693013499724632, "loss": 0.3802, "step": 1350 }, { "epoch": 1.83, "learning_rate": 0.0002360653651617156, "loss": 0.754, "step": 1360 }, { "epoch": 1.85, "learning_rate": 0.00023519630968262477, "loss": 0.6096, "step": 1370 }, { "epoch": 1.86, "learning_rate": 0.00023432301183478018, "loss": 0.5114, "step": 1380 }, { "epoch": 1.87, "learning_rate": 0.00023344551510423808, "loss": 0.6215, "step": 1390 }, { "epoch": 1.89, "learning_rate": 0.00023256386318613873, "loss": 0.5015, "step": 1400 }, { "epoch": 1.9, "learning_rate": 0.00023167809998253102, "loss": 0.6841, "step": 1410 }, { "epoch": 1.91, "learning_rate": 0.00023078826960018612, "loss": 0.7431, "step": 1420 }, { "epoch": 1.93, "learning_rate": 0.00022989441634840128, "loss": 0.6028, "step": 1430 }, { "epoch": 1.94, "learning_rate": 0.00022899658473679344, "loss": 0.6164, "step": 1440 }, { "epoch": 1.95, "learning_rate": 0.00022809481947308276, "loss": 0.7823, "step": 1450 }, { "epoch": 1.97, "learning_rate": 0.0002271891654608665, "loss": 0.5562, "step": 1460 }, { "epoch": 1.98, "learning_rate": 0.00022627966779738306, "loss": 0.6174, "step": 1470 }, { "epoch": 1.99, "learning_rate": 0.00022536637177126615, "loss": 0.671, "step": 1480 }, { "epoch": 2.01, "learning_rate": 0.00022444932286028987, "loss": 0.6333, "step": 1490 }, { "epoch": 2.02, "learning_rate": 0.00022352856672910404, "loss": 0.5254, "step": 1500 }, { "epoch": 2.04, "learning_rate": 0.00022260414922696027, "loss": 0.4731, "step": 1510 }, { "epoch": 2.05, "learning_rate": 0.00022167611638542896, "loss": 0.7305, "step": 1520 }, { "epoch": 2.06, "learning_rate": 0.00022074451441610708, "loss": 0.5911, "step": 1530 }, { "epoch": 2.08, "learning_rate": 0.00021980938970831717, "loss": 0.4527, "step": 1540 }, { "epoch": 2.09, "learning_rate": 0.00021887078882679723, "loss": 0.4437, "step": 1550 }, { "epoch": 2.1, "learning_rate": 0.0002179287585093822, "loss": 0.5298, "step": 1560 }, { "epoch": 2.12, "learning_rate": 0.00021698334566467626, "loss": 0.7712, "step": 1570 }, { "epoch": 2.13, "learning_rate": 0.0002160345973697176, "loss": 0.5333, "step": 1580 }, { "epoch": 2.14, "learning_rate": 0.00021508256086763368, "loss": 0.6186, "step": 1590 }, { "epoch": 2.16, "learning_rate": 0.00021412728356528905, "loss": 0.5444, "step": 1600 }, { "epoch": 2.17, "learning_rate": 0.00021316881303092445, "loss": 0.385, "step": 1610 }, { "epoch": 2.18, "learning_rate": 0.00021220719699178848, "loss": 0.5459, "step": 1620 }, { "epoch": 2.2, "learning_rate": 0.00021124248333176079, "loss": 0.4447, "step": 1630 }, { "epoch": 2.21, "learning_rate": 0.0002102747200889677, "loss": 0.5434, "step": 1640 }, { "epoch": 2.22, "learning_rate": 0.00020930395545339008, "loss": 0.5391, "step": 1650 }, { "epoch": 2.24, "learning_rate": 0.00020833023776446407, "loss": 0.5926, "step": 1660 }, { "epoch": 2.25, "learning_rate": 0.00020735361550867345, "loss": 0.6304, "step": 1670 }, { "epoch": 2.26, "learning_rate": 0.0002063741373171357, "loss": 0.4942, "step": 1680 }, { "epoch": 2.28, "learning_rate": 0.00020539185196318023, "loss": 0.507, "step": 1690 }, { "epoch": 2.29, "learning_rate": 0.00020440680835991969, "loss": 0.4658, "step": 1700 }, { "epoch": 2.3, "learning_rate": 0.00020341905555781433, "loss": 0.4042, "step": 1710 }, { "epoch": 2.32, "learning_rate": 0.00020242864274222955, "loss": 0.5539, "step": 1720 }, { "epoch": 2.33, "learning_rate": 0.0002014356192309868, "loss": 0.4031, "step": 1730 }, { "epoch": 2.35, "learning_rate": 0.00020044003447190756, "loss": 0.4963, "step": 1740 }, { "epoch": 2.36, "learning_rate": 0.00019944193804035117, "loss": 0.3302, "step": 1750 }, { "epoch": 2.37, "learning_rate": 0.00019844137963674643, "loss": 0.5527, "step": 1760 }, { "epoch": 2.39, "learning_rate": 0.0001974384090841164, "loss": 0.485, "step": 1770 }, { "epoch": 2.4, "learning_rate": 0.00019643307632559776, "loss": 0.6018, "step": 1780 }, { "epoch": 2.41, "learning_rate": 0.0001954254314219536, "loss": 0.4818, "step": 1790 }, { "epoch": 2.43, "learning_rate": 0.00019441552454908096, "loss": 0.5312, "step": 1800 }, { "epoch": 2.44, "learning_rate": 0.00019340340599551193, "loss": 0.5679, "step": 1810 }, { "epoch": 2.45, "learning_rate": 0.00019238912615990983, "loss": 0.3859, "step": 1820 }, { "epoch": 2.47, "learning_rate": 0.0001913727355485595, "loss": 0.4745, "step": 1830 }, { "epoch": 2.48, "learning_rate": 0.0001903542847728523, "loss": 0.5523, "step": 1840 }, { "epoch": 2.49, "learning_rate": 0.00018933382454676588, "loss": 0.3833, "step": 1850 }, { "epoch": 2.51, "learning_rate": 0.00018831140568433897, "loss": 0.4132, "step": 1860 }, { "epoch": 2.52, "learning_rate": 0.000187287079097141, "loss": 0.3685, "step": 1870 }, { "epoch": 2.53, "learning_rate": 0.000186260895791737, "loss": 0.6318, "step": 1880 }, { "epoch": 2.55, "learning_rate": 0.00018523290686714756, "loss": 0.4088, "step": 1890 }, { "epoch": 2.56, "learning_rate": 0.0001842031635123046, "loss": 0.5499, "step": 1900 }, { "epoch": 2.57, "learning_rate": 0.00018317171700350224, "loss": 0.4856, "step": 1910 }, { "epoch": 2.59, "learning_rate": 0.0001821386187018435, "loss": 0.6596, "step": 1920 }, { "epoch": 2.6, "learning_rate": 0.00018110392005068286, "loss": 0.3197, "step": 1930 }, { "epoch": 2.61, "learning_rate": 0.00018006767257306447, "loss": 0.3975, "step": 1940 }, { "epoch": 2.63, "learning_rate": 0.00017902992786915663, "loss": 0.3733, "step": 1950 }, { "epoch": 2.64, "learning_rate": 0.00017799073761368234, "loss": 0.4203, "step": 1960 }, { "epoch": 2.65, "learning_rate": 0.00017695015355334624, "loss": 0.4533, "step": 1970 }, { "epoch": 2.67, "learning_rate": 0.00017590822750425774, "loss": 0.3846, "step": 1980 }, { "epoch": 2.68, "learning_rate": 0.0001748650113493508, "loss": 0.4219, "step": 1990 }, { "epoch": 2.7, "learning_rate": 0.0001738205570358006, "loss": 0.2895, "step": 2000 }, { "epoch": 2.71, "learning_rate": 0.00017277491657243668, "loss": 0.3751, "step": 2010 }, { "epoch": 2.72, "learning_rate": 0.000171728142027153, "loss": 0.508, "step": 2020 }, { "epoch": 2.74, "learning_rate": 0.00017068028552431566, "loss": 0.5577, "step": 2030 }, { "epoch": 2.75, "learning_rate": 0.00016963139924216675, "loss": 0.4342, "step": 2040 }, { "epoch": 2.76, "learning_rate": 0.00016858153541022676, "loss": 0.3891, "step": 2050 }, { "epoch": 2.78, "learning_rate": 0.00016753074630669327, "loss": 0.4064, "step": 2060 }, { "epoch": 2.79, "learning_rate": 0.00016647908425583804, "loss": 0.3571, "step": 2070 }, { "epoch": 2.8, "learning_rate": 0.00016542660162540136, "loss": 0.53, "step": 2080 }, { "epoch": 2.82, "learning_rate": 0.00016437335082398455, "loss": 0.4457, "step": 2090 }, { "epoch": 2.83, "learning_rate": 0.00016331938429844022, "loss": 0.4608, "step": 2100 }, { "epoch": 2.84, "learning_rate": 0.0001622647545312604, "loss": 0.4084, "step": 2110 }, { "epoch": 2.86, "learning_rate": 0.00016120951403796364, "loss": 0.6194, "step": 2120 }, { "epoch": 2.87, "learning_rate": 0.0001601537153644795, "loss": 0.5458, "step": 2130 }, { "epoch": 2.88, "learning_rate": 0.00015909741108453243, "loss": 0.3484, "step": 2140 }, { "epoch": 2.9, "learning_rate": 0.00015804065379702352, "loss": 0.2758, "step": 2150 }, { "epoch": 2.91, "learning_rate": 0.00015698349612341156, "loss": 0.2401, "step": 2160 }, { "epoch": 2.92, "learning_rate": 0.00015592599070509265, "loss": 0.4804, "step": 2170 }, { "epoch": 2.94, "learning_rate": 0.00015486819020077886, "loss": 0.4722, "step": 2180 }, { "epoch": 2.95, "learning_rate": 0.0001538101472838762, "loss": 0.4195, "step": 2190 }, { "epoch": 2.96, "learning_rate": 0.00015275191463986159, "loss": 0.4658, "step": 2200 }, { "epoch": 2.98, "learning_rate": 0.00015169354496365948, "loss": 0.5397, "step": 2210 }, { "epoch": 2.99, "learning_rate": 0.0001506350909570179, "loss": 0.4333, "step": 2220 }, { "epoch": 3.01, "learning_rate": 0.0001495766053258841, "loss": 0.4128, "step": 2230 }, { "epoch": 3.02, "learning_rate": 0.00014851814077778016, "loss": 0.4373, "step": 2240 }, { "epoch": 3.03, "learning_rate": 0.00014745975001917812, "loss": 0.3217, "step": 2250 }, { "epoch": 3.05, "learning_rate": 0.00014640148575287593, "loss": 0.3968, "step": 2260 }, { "epoch": 3.06, "learning_rate": 0.0001453434006753726, "loss": 0.398, "step": 2270 }, { "epoch": 3.07, "learning_rate": 0.00014428554747424448, "loss": 0.3392, "step": 2280 }, { "epoch": 3.09, "learning_rate": 0.0001432279788255217, "loss": 0.3295, "step": 2290 }, { "epoch": 3.1, "learning_rate": 0.00014217074739106478, "loss": 0.3666, "step": 2300 }, { "epoch": 3.11, "learning_rate": 0.00014111390581594284, "loss": 0.4743, "step": 2310 }, { "epoch": 3.13, "learning_rate": 0.00014005750672581177, "loss": 0.3309, "step": 2320 }, { "epoch": 3.14, "learning_rate": 0.00013900160272429374, "loss": 0.514, "step": 2330 }, { "epoch": 3.15, "learning_rate": 0.000137946246390358, "loss": 0.3525, "step": 2340 }, { "epoch": 3.17, "learning_rate": 0.00013689149027570246, "loss": 0.3963, "step": 2350 }, { "epoch": 3.18, "learning_rate": 0.00013583738690213718, "loss": 0.268, "step": 2360 }, { "epoch": 3.19, "learning_rate": 0.00013478398875896858, "loss": 0.5475, "step": 2370 }, { "epoch": 3.21, "learning_rate": 0.0001337313483003862, "loss": 0.4491, "step": 2380 }, { "epoch": 3.22, "learning_rate": 0.0001326795179428503, "loss": 0.3173, "step": 2390 }, { "epoch": 3.23, "learning_rate": 0.00013162855006248217, "loss": 0.4052, "step": 2400 }, { "epoch": 3.25, "learning_rate": 0.00013057849699245574, "loss": 0.4724, "step": 2410 }, { "epoch": 3.26, "learning_rate": 0.0001295294110203919, "loss": 0.345, "step": 2420 }, { "epoch": 3.27, "learning_rate": 0.00012848134438575454, "loss": 0.2382, "step": 2430 }, { "epoch": 3.29, "learning_rate": 0.0001274343492772494, "loss": 0.3374, "step": 2440 }, { "epoch": 3.3, "learning_rate": 0.00012638847783022554, "loss": 0.3927, "step": 2450 }, { "epoch": 3.32, "learning_rate": 0.0001253437821240789, "loss": 0.5055, "step": 2460 }, { "epoch": 3.33, "learning_rate": 0.00012430031417965908, "loss": 0.3408, "step": 2470 }, { "epoch": 3.34, "learning_rate": 0.0001232581259566792, "loss": 0.2692, "step": 2480 }, { "epoch": 3.36, "learning_rate": 0.00012221726935112833, "loss": 0.4541, "step": 2490 }, { "epoch": 3.37, "learning_rate": 0.00012117779619268726, "loss": 0.3564, "step": 2500 }, { "epoch": 3.38, "learning_rate": 0.00012013975824214778, "loss": 0.307, "step": 2510 }, { "epoch": 3.4, "learning_rate": 0.00011910320718883525, "loss": 0.3651, "step": 2520 }, { "epoch": 3.41, "learning_rate": 0.00011806819464803458, "loss": 0.4532, "step": 2530 }, { "epoch": 3.42, "learning_rate": 0.00011703477215842013, "loss": 0.324, "step": 2540 }, { "epoch": 3.44, "learning_rate": 0.00011600299117948933, "loss": 0.4484, "step": 2550 }, { "epoch": 3.45, "learning_rate": 0.0001149729030890003, "loss": 0.3729, "step": 2560 }, { "epoch": 3.46, "learning_rate": 0.0001139445591804133, "loss": 0.2915, "step": 2570 }, { "epoch": 3.48, "learning_rate": 0.00011291801066033667, "loss": 0.4071, "step": 2580 }, { "epoch": 3.49, "learning_rate": 0.00011189330864597714, "loss": 0.3065, "step": 2590 }, { "epoch": 3.5, "learning_rate": 0.00011087050416259409, "loss": 0.34, "step": 2600 }, { "epoch": 3.52, "learning_rate": 0.00010984964814095903, "loss": 0.3662, "step": 2610 }, { "epoch": 3.53, "learning_rate": 0.00010883079141481938, "loss": 0.2805, "step": 2620 }, { "epoch": 3.54, "learning_rate": 0.0001078139847183673, "loss": 0.3726, "step": 2630 }, { "epoch": 3.56, "learning_rate": 0.00010679927868371316, "loss": 0.3502, "step": 2640 }, { "epoch": 3.57, "learning_rate": 0.00010578672383836435, "loss": 0.3235, "step": 2650 }, { "epoch": 3.58, "learning_rate": 0.00010477637060270957, "loss": 0.3663, "step": 2660 }, { "epoch": 3.6, "learning_rate": 0.00010376826928750763, "loss": 0.3284, "step": 2670 }, { "epoch": 3.61, "learning_rate": 0.0001027624700913826, "loss": 0.322, "step": 2680 }, { "epoch": 3.63, "learning_rate": 0.000101759023098324, "loss": 0.236, "step": 2690 }, { "epoch": 3.64, "learning_rate": 0.00010075797827519295, "loss": 0.2304, "step": 2700 }, { "epoch": 3.65, "learning_rate": 9.975938546923396e-05, "loss": 0.4279, "step": 2710 }, { "epoch": 3.67, "learning_rate": 9.876329440559268e-05, "loss": 0.3311, "step": 2720 }, { "epoch": 3.68, "learning_rate": 9.776975468484019e-05, "loss": 0.3676, "step": 2730 }, { "epoch": 3.69, "learning_rate": 9.677881578050272e-05, "loss": 0.2316, "step": 2740 }, { "epoch": 3.71, "learning_rate": 9.579052703659831e-05, "loss": 0.345, "step": 2750 }, { "epoch": 3.72, "learning_rate": 9.480493766517982e-05, "loss": 0.4248, "step": 2760 }, { "epoch": 3.73, "learning_rate": 9.382209674388407e-05, "loss": 0.3895, "step": 2770 }, { "epoch": 3.75, "learning_rate": 9.284205321348839e-05, "loss": 0.2675, "step": 2780 }, { "epoch": 3.76, "learning_rate": 9.186485587547324e-05, "loss": 0.4064, "step": 2790 }, { "epoch": 3.77, "learning_rate": 9.08905533895925e-05, "loss": 0.3509, "step": 2800 }, { "epoch": 3.79, "learning_rate": 8.991919427145014e-05, "loss": 0.3982, "step": 2810 }, { "epoch": 3.8, "learning_rate": 8.895082689008442e-05, "loss": 0.3352, "step": 2820 }, { "epoch": 3.81, "learning_rate": 8.798549946555971e-05, "loss": 0.3933, "step": 2830 }, { "epoch": 3.83, "learning_rate": 8.702326006656477e-05, "loss": 0.4379, "step": 2840 }, { "epoch": 3.84, "learning_rate": 8.606415660801956e-05, "loss": 0.2152, "step": 2850 }, { "epoch": 3.85, "learning_rate": 8.510823684868922e-05, "loss": 0.2329, "step": 2860 }, { "epoch": 3.87, "learning_rate": 8.415554838880595e-05, "loss": 0.3286, "step": 2870 }, { "epoch": 3.88, "learning_rate": 8.320613866769852e-05, "loss": 0.3072, "step": 2880 }, { "epoch": 3.89, "learning_rate": 8.22600549614303e-05, "loss": 0.3184, "step": 2890 }, { "epoch": 3.91, "learning_rate": 8.131734438044519e-05, "loss": 0.4206, "step": 2900 }, { "epoch": 3.92, "learning_rate": 8.037805386722124e-05, "loss": 0.4384, "step": 2910 }, { "epoch": 3.94, "learning_rate": 7.944223019393373e-05, "loss": 0.2849, "step": 2920 }, { "epoch": 3.95, "learning_rate": 7.850991996012589e-05, "loss": 0.1819, "step": 2930 }, { "epoch": 3.96, "learning_rate": 7.758116959038828e-05, "loss": 0.3399, "step": 2940 }, { "epoch": 3.98, "learning_rate": 7.665602533204745e-05, "loss": 0.3788, "step": 2950 }, { "epoch": 3.99, "learning_rate": 7.573453325286273e-05, "loss": 0.442, "step": 2960 }, { "epoch": 4.0, "learning_rate": 7.481673923873248e-05, "loss": 0.37, "step": 2970 }, { "epoch": 4.02, "learning_rate": 7.390268899140912e-05, "loss": 0.3434, "step": 2980 }, { "epoch": 4.03, "learning_rate": 7.299242802622322e-05, "loss": 0.26, "step": 2990 }, { "epoch": 4.04, "learning_rate": 7.208600166981743e-05, "loss": 0.3701, "step": 3000 }, { "epoch": 4.06, "learning_rate": 7.118345505788912e-05, "loss": 0.3749, "step": 3010 }, { "epoch": 4.07, "learning_rate": 7.028483313294289e-05, "loss": 0.2764, "step": 3020 }, { "epoch": 4.08, "learning_rate": 6.939018064205281e-05, "loss": 0.3949, "step": 3030 }, { "epoch": 4.1, "learning_rate": 6.849954213463407e-05, "loss": 0.3791, "step": 3040 }, { "epoch": 4.11, "learning_rate": 6.761296196022468e-05, "loss": 0.3701, "step": 3050 }, { "epoch": 4.12, "learning_rate": 6.673048426627714e-05, "loss": 0.3115, "step": 3060 }, { "epoch": 4.14, "learning_rate": 6.585215299595985e-05, "loss": 0.4056, "step": 3070 }, { "epoch": 4.15, "learning_rate": 6.497801188596934e-05, "loss": 0.2566, "step": 3080 }, { "epoch": 4.16, "learning_rate": 6.410810446435216e-05, "loss": 0.3565, "step": 3090 }, { "epoch": 4.18, "learning_rate": 6.324247404833736e-05, "loss": 0.222, "step": 3100 }, { "epoch": 4.19, "learning_rate": 6.23811637421796e-05, "loss": 0.1795, "step": 3110 }, { "epoch": 4.2, "learning_rate": 6.152421643501283e-05, "loss": 0.316, "step": 3120 }, { "epoch": 4.22, "learning_rate": 6.0671674798714305e-05, "loss": 0.3918, "step": 3130 }, { "epoch": 4.23, "learning_rate": 5.9823581285780096e-05, "loss": 0.2051, "step": 3140 }, { "epoch": 4.25, "learning_rate": 5.897997812721103e-05, "loss": 0.2124, "step": 3150 }, { "epoch": 4.26, "learning_rate": 5.814090733040956e-05, "loss": 0.2791, "step": 3160 }, { "epoch": 4.27, "learning_rate": 5.7306410677088524e-05, "loss": 0.2585, "step": 3170 }, { "epoch": 4.29, "learning_rate": 5.6476529721189974e-05, "loss": 0.3343, "step": 3180 }, { "epoch": 4.3, "learning_rate": 5.565130578681649e-05, "loss": 0.3006, "step": 3190 }, { "epoch": 4.31, "learning_rate": 5.483077996617325e-05, "loss": 0.309, "step": 3200 }, { "epoch": 4.33, "learning_rate": 5.4014993117521686e-05, "loss": 0.2552, "step": 3210 }, { "epoch": 4.34, "learning_rate": 5.3203985863145255e-05, "loss": 0.2918, "step": 3220 }, { "epoch": 4.35, "learning_rate": 5.23977985873264e-05, "loss": 0.2198, "step": 3230 }, { "epoch": 4.37, "learning_rate": 5.159647143433575e-05, "loss": 0.2432, "step": 3240 }, { "epoch": 4.38, "learning_rate": 5.080004430643297e-05, "loss": 0.2466, "step": 3250 }, { "epoch": 4.39, "learning_rate": 5.000855686188001e-05, "loss": 0.2845, "step": 3260 }, { "epoch": 4.41, "learning_rate": 4.9222048512966096e-05, "loss": 0.2725, "step": 3270 }, { "epoch": 4.42, "learning_rate": 4.844055842404539e-05, "loss": 0.2334, "step": 3280 }, { "epoch": 4.43, "learning_rate": 4.766412550958674e-05, "loss": 0.2995, "step": 3290 }, { "epoch": 4.45, "learning_rate": 4.689278843223571e-05, "loss": 0.2107, "step": 3300 }, { "epoch": 4.46, "learning_rate": 4.6126585600889834e-05, "loss": 0.2346, "step": 3310 }, { "epoch": 4.47, "learning_rate": 4.536555516878547e-05, "loss": 0.1997, "step": 3320 }, { "epoch": 4.49, "learning_rate": 4.4609735031598425e-05, "loss": 0.3414, "step": 3330 }, { "epoch": 4.5, "learning_rate": 4.3859162825556675e-05, "loss": 0.2281, "step": 3340 }, { "epoch": 4.51, "learning_rate": 4.311387592556626e-05, "loss": 0.269, "step": 3350 }, { "epoch": 4.53, "learning_rate": 4.237391144335031e-05, "loss": 0.3307, "step": 3360 }, { "epoch": 4.54, "learning_rate": 4.163930622560111e-05, "loss": 0.3806, "step": 3370 }, { "epoch": 4.56, "learning_rate": 4.0910096852145024e-05, "loss": 0.3077, "step": 3380 }, { "epoch": 4.57, "learning_rate": 4.018631963412126e-05, "loss": 0.2362, "step": 3390 }, { "epoch": 4.58, "learning_rate": 3.946801061217374e-05, "loss": 0.2585, "step": 3400 }, { "epoch": 4.6, "learning_rate": 3.8755205554656207e-05, "loss": 0.2587, "step": 3410 }, { "epoch": 4.61, "learning_rate": 3.804793995585142e-05, "loss": 0.3041, "step": 3420 }, { "epoch": 4.62, "learning_rate": 3.734624903420356e-05, "loss": 0.2408, "step": 3430 }, { "epoch": 4.64, "learning_rate": 3.6650167730564575e-05, "loss": 0.4386, "step": 3440 }, { "epoch": 4.65, "learning_rate": 3.595973070645425e-05, "loss": 0.253, "step": 3450 }, { "epoch": 4.66, "learning_rate": 3.5274972342334166e-05, "loss": 0.2513, "step": 3460 }, { "epoch": 4.68, "learning_rate": 3.459592673589587e-05, "loss": 0.2299, "step": 3470 }, { "epoch": 4.69, "learning_rate": 3.392262770036299e-05, "loss": 0.3081, "step": 3480 }, { "epoch": 4.7, "learning_rate": 3.325510876280718e-05, "loss": 0.2529, "step": 3490 }, { "epoch": 4.72, "learning_rate": 3.2593403162479026e-05, "loss": 0.2634, "step": 3500 }, { "epoch": 4.73, "learning_rate": 3.19375438491527e-05, "loss": 0.2449, "step": 3510 }, { "epoch": 4.74, "learning_rate": 3.128756348148522e-05, "loss": 0.2089, "step": 3520 }, { "epoch": 4.76, "learning_rate": 3.0643494425390255e-05, "loss": 0.2856, "step": 3530 }, { "epoch": 4.77, "learning_rate": 3.0005368752426416e-05, "loss": 0.3153, "step": 3540 }, { "epoch": 4.78, "learning_rate": 2.937321823820019e-05, "loss": 0.4003, "step": 3550 }, { "epoch": 4.8, "learning_rate": 2.8747074360783838e-05, "loss": 0.2499, "step": 3560 }, { "epoch": 4.81, "learning_rate": 2.81269682991478e-05, "loss": 0.3115, "step": 3570 }, { "epoch": 4.82, "learning_rate": 2.7512930931608144e-05, "loss": 0.239, "step": 3580 }, { "epoch": 4.84, "learning_rate": 2.690499283428909e-05, "loss": 0.2655, "step": 3590 }, { "epoch": 4.85, "learning_rate": 2.630318427960018e-05, "loss": 0.2439, "step": 3600 }, { "epoch": 4.87, "learning_rate": 2.570753523472923e-05, "loss": 0.2491, "step": 3610 }, { "epoch": 4.88, "learning_rate": 2.5118075360149886e-05, "loss": 0.2485, "step": 3620 }, { "epoch": 4.89, "learning_rate": 2.4534834008144632e-05, "loss": 0.1787, "step": 3630 }, { "epoch": 4.91, "learning_rate": 2.3957840221343372e-05, "loss": 0.379, "step": 3640 }, { "epoch": 4.92, "learning_rate": 2.3387122731277074e-05, "loss": 0.274, "step": 3650 }, { "epoch": 4.93, "learning_rate": 2.2822709956947194e-05, "loss": 0.287, "step": 3660 }, { "epoch": 4.95, "learning_rate": 2.2264630003410492e-05, "loss": 0.2741, "step": 3670 }, { "epoch": 4.96, "learning_rate": 2.1712910660379474e-05, "loss": 0.1208, "step": 3680 }, { "epoch": 4.97, "learning_rate": 2.1167579400838735e-05, "loss": 0.2456, "step": 3690 }, { "epoch": 4.99, "learning_rate": 2.062866337967685e-05, "loss": 0.2456, "step": 3700 }, { "epoch": 5.0, "learning_rate": 2.009618943233419e-05, "loss": 0.2643, "step": 3710 } ], "logging_steps": 10, "max_steps": 4452, "num_train_epochs": 6, "save_steps": 500, "total_flos": 1.93884008030208e+16, "trial_name": null, "trial_params": null }