{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0055710306406684, "eval_steps": 50, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 2.5e-06, "loss": 0.2773, "step": 5 }, { "epoch": 0.02, "learning_rate": 5e-06, "loss": 0.3981, "step": 10 }, { "epoch": 0.03, "learning_rate": 7.5e-06, "loss": 0.3499, "step": 15 }, { "epoch": 0.03, "learning_rate": 1e-05, "loss": 0.3669, "step": 20 }, { "epoch": 0.04, "learning_rate": 1.25e-05, "loss": 0.5399, "step": 25 }, { "epoch": 0.05, "learning_rate": 1.5e-05, "loss": 0.3177, "step": 30 }, { "epoch": 0.06, "learning_rate": 1.75e-05, "loss": 0.3217, "step": 35 }, { "epoch": 0.07, "learning_rate": 2e-05, "loss": 0.3754, "step": 40 }, { "epoch": 0.08, "learning_rate": 2.25e-05, "loss": 0.352, "step": 45 }, { "epoch": 0.08, "learning_rate": 2.5e-05, "loss": 0.2789, "step": 50 }, { "epoch": 0.08, "eval_loss": 0.3447761535644531, "eval_runtime": 177.2113, "eval_samples_per_second": 2.252, "eval_steps_per_second": 1.129, "step": 50 }, { "epoch": 0.09, "learning_rate": 2.7500000000000004e-05, "loss": 0.388, "step": 55 }, { "epoch": 0.1, "learning_rate": 3e-05, "loss": 0.4127, "step": 60 }, { "epoch": 0.11, "learning_rate": 3.2500000000000004e-05, "loss": 0.374, "step": 65 }, { "epoch": 0.12, "learning_rate": 3.5e-05, "loss": 0.4176, "step": 70 }, { "epoch": 0.13, "learning_rate": 3.7500000000000003e-05, "loss": 0.2909, "step": 75 }, { "epoch": 0.13, "learning_rate": 4e-05, "loss": 0.3463, "step": 80 }, { "epoch": 0.14, "learning_rate": 4.25e-05, "loss": 0.4037, "step": 85 }, { "epoch": 0.15, "learning_rate": 4.5e-05, "loss": 0.2081, "step": 90 }, { "epoch": 0.16, "learning_rate": 4.75e-05, "loss": 0.29, "step": 95 }, { "epoch": 0.17, "learning_rate": 5e-05, "loss": 0.1879, "step": 100 }, { "epoch": 0.17, "eval_loss": 0.31108421087265015, "eval_runtime": 177.3154, "eval_samples_per_second": 2.25, "eval_steps_per_second": 1.128, "step": 100 }, { "epoch": 0.18, "learning_rate": 4.9999412890861116e-05, "loss": 0.3191, "step": 105 }, { "epoch": 0.18, "learning_rate": 4.9997651591020244e-05, "loss": 0.3487, "step": 110 }, { "epoch": 0.19, "learning_rate": 4.999471618320339e-05, "loss": 0.2979, "step": 115 }, { "epoch": 0.2, "learning_rate": 4.999060680528294e-05, "loss": 0.4943, "step": 120 }, { "epoch": 0.21, "learning_rate": 4.9985323650271165e-05, "loss": 0.3237, "step": 125 }, { "epoch": 0.22, "learning_rate": 4.997886696631114e-05, "loss": 0.3702, "step": 130 }, { "epoch": 0.23, "learning_rate": 4.997123705666514e-05, "loss": 0.3141, "step": 135 }, { "epoch": 0.23, "learning_rate": 4.9962434279700316e-05, "loss": 0.3956, "step": 140 }, { "epoch": 0.24, "learning_rate": 4.9952459048871945e-05, "loss": 0.2972, "step": 145 }, { "epoch": 0.25, "learning_rate": 4.9941311832703954e-05, "loss": 0.323, "step": 150 }, { "epoch": 0.25, "eval_loss": 0.2948896288871765, "eval_runtime": 177.2222, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.129, "step": 150 }, { "epoch": 0.26, "learning_rate": 4.9928993154766957e-05, "loss": 0.3052, "step": 155 }, { "epoch": 0.27, "learning_rate": 4.99155035936536e-05, "loss": 0.3417, "step": 160 }, { "epoch": 0.28, "learning_rate": 4.9900843782951475e-05, "loss": 0.2436, "step": 165 }, { "epoch": 0.28, "learning_rate": 4.9885014411213285e-05, "loss": 0.274, "step": 170 }, { "epoch": 0.29, "learning_rate": 4.986801622192453e-05, "loss": 0.3599, "step": 175 }, { "epoch": 0.3, "learning_rate": 4.9849850013468585e-05, "loss": 0.3956, "step": 180 }, { "epoch": 0.31, "learning_rate": 4.983051663908922e-05, "loss": 0.2769, "step": 185 }, { "epoch": 0.32, "learning_rate": 4.98100170068505e-05, "loss": 0.3369, "step": 190 }, { "epoch": 0.33, "learning_rate": 4.9788352079594136e-05, "loss": 0.4247, "step": 195 }, { "epoch": 0.33, "learning_rate": 4.976552287489426e-05, "loss": 0.3049, "step": 200 }, { "epoch": 0.33, "eval_loss": 0.28148066997528076, "eval_runtime": 177.3094, "eval_samples_per_second": 2.25, "eval_steps_per_second": 1.128, "step": 200 }, { "epoch": 0.34, "learning_rate": 4.974153046500967e-05, "loss": 0.3088, "step": 205 }, { "epoch": 0.35, "learning_rate": 4.9716375976833396e-05, "loss": 0.34, "step": 210 }, { "epoch": 0.36, "learning_rate": 4.969006059183984e-05, "loss": 0.2245, "step": 215 }, { "epoch": 0.37, "learning_rate": 4.9662585546029246e-05, "loss": 0.4144, "step": 220 }, { "epoch": 0.38, "learning_rate": 4.963395212986964e-05, "loss": 0.3311, "step": 225 }, { "epoch": 0.38, "learning_rate": 4.960416168823626e-05, "loss": 0.2514, "step": 230 }, { "epoch": 0.39, "learning_rate": 4.957321562034833e-05, "loss": 0.3464, "step": 235 }, { "epoch": 0.4, "learning_rate": 4.954111537970342e-05, "loss": 0.24, "step": 240 }, { "epoch": 0.41, "learning_rate": 4.950786247400908e-05, "loss": 0.2474, "step": 245 }, { "epoch": 0.42, "learning_rate": 4.94734584651121e-05, "loss": 0.2993, "step": 250 }, { "epoch": 0.42, "eval_loss": 0.2721606492996216, "eval_runtime": 177.2024, "eval_samples_per_second": 2.252, "eval_steps_per_second": 1.129, "step": 250 }, { "epoch": 0.43, "learning_rate": 4.943790496892513e-05, "loss": 0.2876, "step": 255 }, { "epoch": 0.43, "learning_rate": 4.9401203655350766e-05, "loss": 0.396, "step": 260 }, { "epoch": 0.44, "learning_rate": 4.936335624820313e-05, "loss": 0.2472, "step": 265 }, { "epoch": 0.45, "learning_rate": 4.932436452512693e-05, "loss": 0.2673, "step": 270 }, { "epoch": 0.46, "learning_rate": 4.9284230317513906e-05, "loss": 0.2537, "step": 275 }, { "epoch": 0.47, "learning_rate": 4.9242955510416877e-05, "loss": 0.2601, "step": 280 }, { "epoch": 0.48, "learning_rate": 4.920054204246115e-05, "loss": 0.2689, "step": 285 }, { "epoch": 0.48, "learning_rate": 4.915699190575349e-05, "loss": 0.2658, "step": 290 }, { "epoch": 0.49, "learning_rate": 4.911230714578858e-05, "loss": 0.2916, "step": 295 }, { "epoch": 0.5, "learning_rate": 4.906648986135287e-05, "loss": 0.1683, "step": 300 }, { "epoch": 0.5, "eval_loss": 0.2587164044380188, "eval_runtime": 177.1919, "eval_samples_per_second": 2.252, "eval_steps_per_second": 1.129, "step": 300 }, { "epoch": 0.51, "learning_rate": 4.901954220442609e-05, "loss": 0.2327, "step": 305 }, { "epoch": 0.52, "learning_rate": 4.897146638008012e-05, "loss": 0.1792, "step": 310 }, { "epoch": 0.53, "learning_rate": 4.89222646463754e-05, "loss": 0.2767, "step": 315 }, { "epoch": 0.53, "learning_rate": 4.8871939314254965e-05, "loss": 0.2448, "step": 320 }, { "epoch": 0.54, "learning_rate": 4.8820492747435773e-05, "loss": 0.3314, "step": 325 }, { "epoch": 0.55, "learning_rate": 4.8767927362297816e-05, "loss": 0.2794, "step": 330 }, { "epoch": 0.56, "learning_rate": 4.871424562777052e-05, "loss": 0.3167, "step": 335 }, { "epoch": 0.57, "learning_rate": 4.865945006521684e-05, "loss": 0.2845, "step": 340 }, { "epoch": 0.58, "learning_rate": 4.860354324831482e-05, "loss": 0.2463, "step": 345 }, { "epoch": 0.58, "learning_rate": 4.854652780293672e-05, "loss": 0.4197, "step": 350 }, { "epoch": 0.58, "eval_loss": 0.25140881538391113, "eval_runtime": 177.2007, "eval_samples_per_second": 2.252, "eval_steps_per_second": 1.129, "step": 350 }, { "epoch": 0.59, "learning_rate": 4.848840640702564e-05, "loss": 0.1682, "step": 355 }, { "epoch": 0.6, "learning_rate": 4.8429181790469824e-05, "loss": 0.1677, "step": 360 }, { "epoch": 0.61, "learning_rate": 4.836885673497434e-05, "loss": 0.2691, "step": 365 }, { "epoch": 0.62, "learning_rate": 4.830743407393051e-05, "loss": 0.2319, "step": 370 }, { "epoch": 0.63, "learning_rate": 4.8244916692282786e-05, "loss": 0.3504, "step": 375 }, { "epoch": 0.64, "learning_rate": 4.818130752639326e-05, "loss": 0.2515, "step": 380 }, { "epoch": 0.64, "learning_rate": 4.8116609563903725e-05, "loss": 0.2558, "step": 385 }, { "epoch": 0.65, "learning_rate": 4.8050825843595395e-05, "loss": 0.2106, "step": 390 }, { "epoch": 0.66, "learning_rate": 4.798395945524614e-05, "loss": 0.2607, "step": 395 }, { "epoch": 0.67, "learning_rate": 4.791601353948537e-05, "loss": 0.2709, "step": 400 }, { "epoch": 0.67, "eval_loss": 0.24746671319007874, "eval_runtime": 177.2021, "eval_samples_per_second": 2.252, "eval_steps_per_second": 1.129, "step": 400 }, { "epoch": 0.68, "learning_rate": 4.784699128764654e-05, "loss": 0.2166, "step": 405 }, { "epoch": 0.69, "learning_rate": 4.777689594161724e-05, "loss": 0.194, "step": 410 }, { "epoch": 0.69, "learning_rate": 4.770573079368691e-05, "loss": 0.2225, "step": 415 }, { "epoch": 0.7, "learning_rate": 4.763349918639227e-05, "loss": 0.3093, "step": 420 }, { "epoch": 0.71, "learning_rate": 4.756020451236025e-05, "loss": 0.2138, "step": 425 }, { "epoch": 0.72, "learning_rate": 4.748585021414869e-05, "loss": 0.2779, "step": 430 }, { "epoch": 0.73, "learning_rate": 4.7410439784084626e-05, "loss": 0.1564, "step": 435 }, { "epoch": 0.74, "learning_rate": 4.7333976764100275e-05, "loss": 0.1869, "step": 440 }, { "epoch": 0.74, "learning_rate": 4.725646474556665e-05, "loss": 0.3141, "step": 445 }, { "epoch": 0.75, "learning_rate": 4.717790736912493e-05, "loss": 0.3625, "step": 450 }, { "epoch": 0.75, "eval_loss": 0.24145641922950745, "eval_runtime": 177.2307, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.128, "step": 450 }, { "epoch": 0.76, "learning_rate": 4.709830832451538e-05, "loss": 0.1778, "step": 455 }, { "epoch": 0.77, "learning_rate": 4.701767135040414e-05, "loss": 0.2132, "step": 460 }, { "epoch": 0.78, "learning_rate": 4.693600023420758e-05, "loss": 0.2449, "step": 465 }, { "epoch": 0.79, "learning_rate": 4.685329881191436e-05, "loss": 0.1909, "step": 470 }, { "epoch": 0.79, "learning_rate": 4.676957096790536e-05, "loss": 0.2419, "step": 475 }, { "epoch": 0.8, "learning_rate": 4.668482063477118e-05, "loss": 0.2918, "step": 480 }, { "epoch": 0.81, "learning_rate": 4.659905179312742e-05, "loss": 0.3548, "step": 485 }, { "epoch": 0.82, "learning_rate": 4.6512268471427745e-05, "loss": 0.1435, "step": 490 }, { "epoch": 0.83, "learning_rate": 4.642447474577466e-05, "loss": 0.2817, "step": 495 }, { "epoch": 0.84, "learning_rate": 4.6335674739728055e-05, "loss": 0.2872, "step": 500 }, { "epoch": 0.84, "eval_loss": 0.23039554059505463, "eval_runtime": 177.2172, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.129, "step": 500 }, { "epoch": 0.84, "learning_rate": 4.624587262411153e-05, "loss": 0.2396, "step": 505 }, { "epoch": 0.85, "learning_rate": 4.615507261681651e-05, "loss": 0.177, "step": 510 }, { "epoch": 0.86, "learning_rate": 4.606327898260413e-05, "loss": 0.3332, "step": 515 }, { "epoch": 0.87, "learning_rate": 4.597049603290491e-05, "loss": 0.246, "step": 520 }, { "epoch": 0.88, "learning_rate": 4.5876728125616264e-05, "loss": 0.3666, "step": 525 }, { "epoch": 0.89, "learning_rate": 4.578197966489781e-05, "loss": 0.2239, "step": 530 }, { "epoch": 0.89, "learning_rate": 4.568625510096454e-05, "loss": 0.246, "step": 535 }, { "epoch": 0.9, "learning_rate": 4.5589558929877736e-05, "loss": 0.2086, "step": 540 }, { "epoch": 0.91, "learning_rate": 4.5491895693333876e-05, "loss": 0.2635, "step": 545 }, { "epoch": 0.92, "learning_rate": 4.5393269978451234e-05, "loss": 0.1746, "step": 550 }, { "epoch": 0.92, "eval_loss": 0.2252800166606903, "eval_runtime": 177.2253, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.129, "step": 550 }, { "epoch": 0.93, "learning_rate": 4.5293686417554525e-05, "loss": 0.2149, "step": 555 }, { "epoch": 0.94, "learning_rate": 4.519314968795722e-05, "loss": 0.1499, "step": 560 }, { "epoch": 0.94, "learning_rate": 4.5091664511741944e-05, "loss": 0.418, "step": 565 }, { "epoch": 0.95, "learning_rate": 4.4989235655538654e-05, "loss": 0.1811, "step": 570 }, { "epoch": 0.96, "learning_rate": 4.4885867930300744e-05, "loss": 0.308, "step": 575 }, { "epoch": 0.97, "learning_rate": 4.478156619107912e-05, "loss": 0.1722, "step": 580 }, { "epoch": 0.98, "learning_rate": 4.4676335336794124e-05, "loss": 0.2718, "step": 585 }, { "epoch": 0.99, "learning_rate": 4.457018031000545e-05, "loss": 0.2606, "step": 590 }, { "epoch": 0.99, "learning_rate": 4.446310609668e-05, "loss": 0.1796, "step": 595 }, { "epoch": 1.0, "learning_rate": 4.435511772595773e-05, "loss": 0.1841, "step": 600 }, { "epoch": 1.0, "eval_loss": 0.21893376111984253, "eval_runtime": 177.1964, "eval_samples_per_second": 2.252, "eval_steps_per_second": 1.129, "step": 600 }, { "epoch": 1.01, "learning_rate": 4.424622026991536e-05, "loss": 0.0985, "step": 605 }, { "epoch": 1.02, "learning_rate": 4.4136418843328244e-05, "loss": 0.2228, "step": 610 }, { "epoch": 1.03, "learning_rate": 4.402571860343006e-05, "loss": 0.1185, "step": 615 }, { "epoch": 1.04, "learning_rate": 4.39141247496706e-05, "loss": 0.1903, "step": 620 }, { "epoch": 1.04, "learning_rate": 4.3801642523471585e-05, "loss": 0.0503, "step": 625 }, { "epoch": 1.05, "learning_rate": 4.3688277207980446e-05, "loss": 0.1138, "step": 630 }, { "epoch": 1.06, "learning_rate": 4.357403412782219e-05, "loss": 0.1712, "step": 635 }, { "epoch": 1.07, "learning_rate": 4.345891864884937e-05, "loss": 0.1137, "step": 640 }, { "epoch": 1.08, "learning_rate": 4.334293617788992e-05, "loss": 0.1466, "step": 645 }, { "epoch": 1.09, "learning_rate": 4.322609216249336e-05, "loss": 0.1264, "step": 650 }, { "epoch": 1.09, "eval_loss": 0.22615452110767365, "eval_runtime": 177.2186, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.129, "step": 650 }, { "epoch": 1.09, "learning_rate": 4.3108392090674816e-05, "loss": 0.1117, "step": 655 }, { "epoch": 1.1, "learning_rate": 4.2989841490657325e-05, "loss": 0.212, "step": 660 }, { "epoch": 1.11, "learning_rate": 4.287044593061213e-05, "loss": 0.1206, "step": 665 }, { "epoch": 1.12, "learning_rate": 4.27502110183972e-05, "loss": 0.2377, "step": 670 }, { "epoch": 1.13, "learning_rate": 4.262914240129379e-05, "loss": 0.1839, "step": 675 }, { "epoch": 1.14, "learning_rate": 4.250724576574122e-05, "loss": 0.1335, "step": 680 }, { "epoch": 1.14, "learning_rate": 4.2384526837069785e-05, "loss": 0.0865, "step": 685 }, { "epoch": 1.15, "learning_rate": 4.226099137923186e-05, "loss": 0.164, "step": 690 }, { "epoch": 1.16, "learning_rate": 4.213664519453114e-05, "loss": 0.1225, "step": 695 }, { "epoch": 1.17, "learning_rate": 4.201149412335015e-05, "loss": 0.0872, "step": 700 }, { "epoch": 1.17, "eval_loss": 0.22944454848766327, "eval_runtime": 177.2171, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.129, "step": 700 }, { "epoch": 1.18, "learning_rate": 4.188554404387588e-05, "loss": 0.169, "step": 705 }, { "epoch": 1.19, "learning_rate": 4.1758800871823756e-05, "loss": 0.2083, "step": 710 }, { "epoch": 1.19, "learning_rate": 4.163127056015975e-05, "loss": 0.1313, "step": 715 }, { "epoch": 1.2, "learning_rate": 4.150295909882077e-05, "loss": 0.1935, "step": 720 }, { "epoch": 1.21, "learning_rate": 4.1373872514433355e-05, "loss": 0.097, "step": 725 }, { "epoch": 1.22, "learning_rate": 4.124401687003057e-05, "loss": 0.1333, "step": 730 }, { "epoch": 1.23, "learning_rate": 4.111339826476725e-05, "loss": 0.1156, "step": 735 }, { "epoch": 1.24, "learning_rate": 4.098202283363356e-05, "loss": 0.1681, "step": 740 }, { "epoch": 1.25, "learning_rate": 4.084989674716679e-05, "loss": 0.2055, "step": 745 }, { "epoch": 1.25, "learning_rate": 4.071702621116158e-05, "loss": 0.1379, "step": 750 }, { "epoch": 1.25, "eval_loss": 0.22864395380020142, "eval_runtime": 177.2169, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.129, "step": 750 }, { "epoch": 1.26, "learning_rate": 4.0583417466378396e-05, "loss": 0.1359, "step": 755 }, { "epoch": 1.27, "learning_rate": 4.0449076788250446e-05, "loss": 0.1283, "step": 760 }, { "epoch": 1.28, "learning_rate": 4.031401048658892e-05, "loss": 0.1762, "step": 765 }, { "epoch": 1.29, "learning_rate": 4.0178224905286635e-05, "loss": 0.2163, "step": 770 }, { "epoch": 1.3, "learning_rate": 4.004172642202002e-05, "loss": 0.2701, "step": 775 }, { "epoch": 1.3, "learning_rate": 3.990452144794966e-05, "loss": 0.247, "step": 780 }, { "epoch": 1.31, "learning_rate": 3.9766616427419076e-05, "loss": 0.1633, "step": 785 }, { "epoch": 1.32, "learning_rate": 3.96280178376521e-05, "loss": 0.2521, "step": 790 }, { "epoch": 1.33, "learning_rate": 3.948873218844863e-05, "loss": 0.1352, "step": 795 }, { "epoch": 1.34, "learning_rate": 3.934876602187886e-05, "loss": 0.1648, "step": 800 }, { "epoch": 1.34, "eval_loss": 0.21747413277626038, "eval_runtime": 177.23, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.128, "step": 800 }, { "epoch": 1.35, "learning_rate": 3.920812591197604e-05, "loss": 0.159, "step": 805 }, { "epoch": 1.35, "learning_rate": 3.9066818464427676e-05, "loss": 0.1805, "step": 810 }, { "epoch": 1.36, "learning_rate": 3.892485031626527e-05, "loss": 0.1216, "step": 815 }, { "epoch": 1.37, "learning_rate": 3.878222813555261e-05, "loss": 0.1362, "step": 820 }, { "epoch": 1.38, "learning_rate": 3.863895862107255e-05, "loss": 0.1452, "step": 825 }, { "epoch": 1.39, "learning_rate": 3.849504850201237e-05, "loss": 0.2143, "step": 830 }, { "epoch": 1.4, "learning_rate": 3.835050453764779e-05, "loss": 0.1941, "step": 835 }, { "epoch": 1.4, "learning_rate": 3.820533351702538e-05, "loss": 0.2172, "step": 840 }, { "epoch": 1.41, "learning_rate": 3.80595422586438e-05, "loss": 0.1827, "step": 845 }, { "epoch": 1.42, "learning_rate": 3.791313761013343e-05, "loss": 0.1103, "step": 850 }, { "epoch": 1.42, "eval_loss": 0.21774926781654358, "eval_runtime": 177.2164, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.129, "step": 850 }, { "epoch": 1.43, "learning_rate": 3.7766126447934857e-05, "loss": 0.3314, "step": 855 }, { "epoch": 1.44, "learning_rate": 3.761851567697583e-05, "loss": 0.1565, "step": 860 }, { "epoch": 1.45, "learning_rate": 3.7470312230346956e-05, "loss": 0.1062, "step": 865 }, { "epoch": 1.45, "learning_rate": 3.732152306897607e-05, "loss": 0.2157, "step": 870 }, { "epoch": 1.46, "learning_rate": 3.717215518130127e-05, "loss": 0.0835, "step": 875 }, { "epoch": 1.47, "learning_rate": 3.702221558294274e-05, "loss": 0.2165, "step": 880 }, { "epoch": 1.48, "learning_rate": 3.687171131637314e-05, "loss": 0.1374, "step": 885 }, { "epoch": 1.49, "learning_rate": 3.6720649450586884e-05, "loss": 0.1446, "step": 890 }, { "epoch": 1.5, "learning_rate": 3.656903708076815e-05, "loss": 0.1472, "step": 895 }, { "epoch": 1.5, "learning_rate": 3.641688132795757e-05, "loss": 0.1456, "step": 900 }, { "epoch": 1.5, "eval_loss": 0.2085915505886078, "eval_runtime": 177.2388, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.128, "step": 900 }, { "epoch": 1.51, "learning_rate": 3.626418933871776e-05, "loss": 0.1535, "step": 905 }, { "epoch": 1.52, "learning_rate": 3.611096828479773e-05, "loss": 0.1476, "step": 910 }, { "epoch": 1.53, "learning_rate": 3.595722536279595e-05, "loss": 0.1419, "step": 915 }, { "epoch": 1.54, "learning_rate": 3.5802967793822384e-05, "loss": 0.1428, "step": 920 }, { "epoch": 1.55, "learning_rate": 3.564820282315932e-05, "loss": 0.1682, "step": 925 }, { "epoch": 1.55, "learning_rate": 3.549293771992104e-05, "loss": 0.11, "step": 930 }, { "epoch": 1.56, "learning_rate": 3.533717977671243e-05, "loss": 0.1534, "step": 935 }, { "epoch": 1.57, "learning_rate": 3.518093630928644e-05, "loss": 0.1175, "step": 940 }, { "epoch": 1.58, "learning_rate": 3.502421465620049e-05, "loss": 0.1287, "step": 945 }, { "epoch": 1.59, "learning_rate": 3.486702217847176e-05, "loss": 0.1042, "step": 950 }, { "epoch": 1.59, "eval_loss": 0.210044726729393, "eval_runtime": 177.2695, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.128, "step": 950 }, { "epoch": 1.6, "learning_rate": 3.470936625923147e-05, "loss": 0.1185, "step": 955 }, { "epoch": 1.6, "learning_rate": 3.455125430337809e-05, "loss": 0.1479, "step": 960 }, { "epoch": 1.61, "learning_rate": 3.439269373722957e-05, "loss": 0.1272, "step": 965 }, { "epoch": 1.62, "learning_rate": 3.4233692008174493e-05, "loss": 0.1147, "step": 970 }, { "epoch": 1.63, "learning_rate": 3.407425658432233e-05, "loss": 0.1177, "step": 975 }, { "epoch": 1.64, "learning_rate": 3.3914394954152636e-05, "loss": 0.2132, "step": 980 }, { "epoch": 1.65, "learning_rate": 3.375411462616332e-05, "loss": 0.0528, "step": 985 }, { "epoch": 1.65, "learning_rate": 3.359342312851802e-05, "loss": 0.1488, "step": 990 }, { "epoch": 1.66, "learning_rate": 3.343232800869247e-05, "loss": 0.111, "step": 995 }, { "epoch": 1.67, "learning_rate": 3.327083683312004e-05, "loss": 0.2429, "step": 1000 }, { "epoch": 1.67, "eval_loss": 0.20871932804584503, "eval_runtime": 177.3553, "eval_samples_per_second": 2.25, "eval_steps_per_second": 1.128, "step": 1000 }, { "epoch": 1.68, "learning_rate": 3.3108957186836346e-05, "loss": 0.0577, "step": 1005 }, { "epoch": 1.69, "learning_rate": 3.294669667312295e-05, "loss": 0.1302, "step": 1010 }, { "epoch": 1.7, "learning_rate": 3.27840629131503e-05, "loss": 0.1289, "step": 1015 }, { "epoch": 1.7, "learning_rate": 3.262106354561973e-05, "loss": 0.153, "step": 1020 }, { "epoch": 1.71, "learning_rate": 3.245770622640471e-05, "loss": 0.1765, "step": 1025 }, { "epoch": 1.72, "learning_rate": 3.2293998628191246e-05, "loss": 0.1737, "step": 1030 }, { "epoch": 1.73, "learning_rate": 3.212994844011748e-05, "loss": 0.1098, "step": 1035 }, { "epoch": 1.74, "learning_rate": 3.196556336741261e-05, "loss": 0.1077, "step": 1040 }, { "epoch": 1.75, "learning_rate": 3.18008511310349e-05, "loss": 0.2765, "step": 1045 }, { "epoch": 1.75, "learning_rate": 3.163581946730909e-05, "loss": 0.1372, "step": 1050 }, { "epoch": 1.75, "eval_loss": 0.2058500349521637, "eval_runtime": 177.254, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.128, "step": 1050 }, { "epoch": 1.76, "learning_rate": 3.147047612756302e-05, "loss": 0.1052, "step": 1055 }, { "epoch": 1.77, "learning_rate": 3.130482887776356e-05, "loss": 0.0977, "step": 1060 }, { "epoch": 1.78, "learning_rate": 3.113888549815184e-05, "loss": 0.1577, "step": 1065 }, { "epoch": 1.79, "learning_rate": 3.097265378287784e-05, "loss": 0.144, "step": 1070 }, { "epoch": 1.8, "learning_rate": 3.080614153963429e-05, "loss": 0.2272, "step": 1075 }, { "epoch": 1.81, "learning_rate": 3.063935658928998e-05, "loss": 0.1185, "step": 1080 }, { "epoch": 1.81, "learning_rate": 3.047230676552239e-05, "loss": 0.1589, "step": 1085 }, { "epoch": 1.82, "learning_rate": 3.0304999914449773e-05, "loss": 0.1232, "step": 1090 }, { "epoch": 1.83, "learning_rate": 3.0137443894262635e-05, "loss": 0.1844, "step": 1095 }, { "epoch": 1.84, "learning_rate": 2.996964657485463e-05, "loss": 0.0875, "step": 1100 }, { "epoch": 1.84, "eval_loss": 0.19900214672088623, "eval_runtime": 177.2089, "eval_samples_per_second": 2.252, "eval_steps_per_second": 1.129, "step": 1100 }, { "epoch": 1.85, "learning_rate": 2.980161583745294e-05, "loss": 0.1628, "step": 1105 }, { "epoch": 1.86, "learning_rate": 2.9633359574248075e-05, "loss": 0.1706, "step": 1110 }, { "epoch": 1.86, "learning_rate": 2.9464885688023242e-05, "loss": 0.0676, "step": 1115 }, { "epoch": 1.87, "learning_rate": 2.9296202091783072e-05, "loss": 0.0926, "step": 1120 }, { "epoch": 1.88, "learning_rate": 2.912731670838207e-05, "loss": 0.1236, "step": 1125 }, { "epoch": 1.89, "learning_rate": 2.895823747015237e-05, "loss": 0.1689, "step": 1130 }, { "epoch": 1.9, "learning_rate": 2.878897231853127e-05, "loss": 0.0756, "step": 1135 }, { "epoch": 1.91, "learning_rate": 2.8619529203688163e-05, "loss": 0.1322, "step": 1140 }, { "epoch": 1.91, "learning_rate": 2.8449916084151128e-05, "loss": 0.1436, "step": 1145 }, { "epoch": 1.92, "learning_rate": 2.8280140926433192e-05, "loss": 0.1575, "step": 1150 }, { "epoch": 1.92, "eval_loss": 0.2005121409893036, "eval_runtime": 177.219, "eval_samples_per_second": 2.251, "eval_steps_per_second": 1.129, "step": 1150 }, { "epoch": 1.93, "learning_rate": 2.8110211704658073e-05, "loss": 0.1007, "step": 1155 }, { "epoch": 1.94, "learning_rate": 2.7940136400185695e-05, "loss": 0.1053, "step": 1160 }, { "epoch": 1.95, "learning_rate": 2.7769923001237318e-05, "loss": 0.221, "step": 1165 }, { "epoch": 1.96, "learning_rate": 2.7599579502520295e-05, "loss": 0.1379, "step": 1170 }, { "epoch": 1.96, "learning_rate": 2.7429113904852616e-05, "loss": 0.164, "step": 1175 }, { "epoch": 1.97, "learning_rate": 2.7258534214787108e-05, "loss": 0.141, "step": 1180 }, { "epoch": 1.98, "learning_rate": 2.7087848444235353e-05, "loss": 0.1734, "step": 1185 }, { "epoch": 1.99, "learning_rate": 2.6917064610091423e-05, "loss": 0.2103, "step": 1190 }, { "epoch": 2.0, "learning_rate": 2.674619073385531e-05, "loss": 0.1622, "step": 1195 }, { "epoch": 2.01, "learning_rate": 2.6575234841256137e-05, "loss": 0.0639, "step": 1200 }, { "epoch": 2.01, "eval_loss": 0.19528049230575562, "eval_runtime": 177.2074, "eval_samples_per_second": 2.252, "eval_steps_per_second": 1.129, "step": 1200 } ], "logging_steps": 5, "max_steps": 2392, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "total_flos": 5.908256757514568e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }