{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 711, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 6.7530055713569785, "learning_rate": 2.7777777777777776e-07, "loss": 1.4781, "step": 1 }, { "epoch": 0.02, "grad_norm": 6.606048584660583, "learning_rate": 1.3888888888888892e-06, "loss": 1.3861, "step": 5 }, { "epoch": 0.04, "grad_norm": 5.138983961367188, "learning_rate": 2.7777777777777783e-06, "loss": 1.4422, "step": 10 }, { "epoch": 0.06, "grad_norm": 1.2528549295189984, "learning_rate": 4.166666666666667e-06, "loss": 1.3677, "step": 15 }, { "epoch": 0.08, "grad_norm": 1.6826024664234687, "learning_rate": 5.555555555555557e-06, "loss": 1.3277, "step": 20 }, { "epoch": 0.11, "grad_norm": 1.049153253140281, "learning_rate": 6.944444444444445e-06, "loss": 1.3513, "step": 25 }, { "epoch": 0.13, "grad_norm": 1.1637754450319215, "learning_rate": 8.333333333333334e-06, "loss": 1.3602, "step": 30 }, { "epoch": 0.15, "grad_norm": 0.9614901239317104, "learning_rate": 9.722222222222223e-06, "loss": 1.3316, "step": 35 }, { "epoch": 0.17, "grad_norm": 0.8800168840313798, "learning_rate": 1.1111111111111113e-05, "loss": 1.2865, "step": 40 }, { "epoch": 0.19, "grad_norm": 0.8743147668040568, "learning_rate": 1.25e-05, "loss": 1.3439, "step": 45 }, { "epoch": 0.21, "grad_norm": 0.8090973361916171, "learning_rate": 1.388888888888889e-05, "loss": 1.3323, "step": 50 }, { "epoch": 0.23, "grad_norm": 0.7791934698691907, "learning_rate": 1.5277777777777777e-05, "loss": 1.3316, "step": 55 }, { "epoch": 0.25, "grad_norm": 0.7332468554264387, "learning_rate": 1.6666666666666667e-05, "loss": 1.3418, "step": 60 }, { "epoch": 0.27, "grad_norm": 0.7367875442634256, "learning_rate": 1.8055555555555558e-05, "loss": 1.3488, "step": 65 }, { "epoch": 0.3, "grad_norm": 3.454271660640428, "learning_rate": 1.9444444444444445e-05, "loss": 1.3402, "step": 70 }, { "epoch": 0.32, "grad_norm": 1.41303669264439, "learning_rate": 1.999891231617599e-05, "loss": 1.3198, "step": 75 }, { "epoch": 0.34, "grad_norm": 0.7111760423501922, "learning_rate": 1.9992266216318037e-05, "loss": 1.291, "step": 80 }, { "epoch": 0.36, "grad_norm": 0.6985734110465796, "learning_rate": 1.997958229642588e-05, "loss": 1.3195, "step": 85 }, { "epoch": 0.38, "grad_norm": 0.6774596159630265, "learning_rate": 1.996086822074945e-05, "loss": 1.3296, "step": 90 }, { "epoch": 0.4, "grad_norm": 0.68714971160374, "learning_rate": 1.9936135297256183e-05, "loss": 1.3106, "step": 95 }, { "epoch": 0.42, "grad_norm": 0.7055100697390134, "learning_rate": 1.9905398470798207e-05, "loss": 1.3251, "step": 100 }, { "epoch": 0.44, "grad_norm": 0.7442856779901117, "learning_rate": 1.9868676314081907e-05, "loss": 1.3154, "step": 105 }, { "epoch": 0.46, "grad_norm": 0.6786674709866586, "learning_rate": 1.9825991016445387e-05, "loss": 1.305, "step": 110 }, { "epoch": 0.49, "grad_norm": 0.701320710348796, "learning_rate": 1.9777368370450582e-05, "loss": 1.2797, "step": 115 }, { "epoch": 0.51, "grad_norm": 0.7694290958620503, "learning_rate": 1.9722837756298112e-05, "loss": 1.3094, "step": 120 }, { "epoch": 0.53, "grad_norm": 0.7198163251696982, "learning_rate": 1.9662432124074325e-05, "loss": 1.3283, "step": 125 }, { "epoch": 0.55, "grad_norm": 0.7451781825097182, "learning_rate": 1.9596187973841218e-05, "loss": 1.2966, "step": 130 }, { "epoch": 0.57, "grad_norm": 0.7153678487864736, "learning_rate": 1.9524145333581315e-05, "loss": 1.3192, "step": 135 }, { "epoch": 0.59, "grad_norm": 2.8370131166342687, "learning_rate": 1.9446347735010765e-05, "loss": 1.2988, "step": 140 }, { "epoch": 0.61, "grad_norm": 0.7273784346266812, "learning_rate": 1.9362842187275354e-05, "loss": 1.2815, "step": 145 }, { "epoch": 0.63, "grad_norm": 0.7610942375358005, "learning_rate": 1.9273679148545246e-05, "loss": 1.3121, "step": 150 }, { "epoch": 0.65, "grad_norm": 0.719303870712118, "learning_rate": 1.917891249552568e-05, "loss": 1.3196, "step": 155 }, { "epoch": 0.68, "grad_norm": 0.6786014692189536, "learning_rate": 1.9078599490901984e-05, "loss": 1.3033, "step": 160 }, { "epoch": 0.7, "grad_norm": 0.6704522856064519, "learning_rate": 1.897280074873868e-05, "loss": 1.2896, "step": 165 }, { "epoch": 0.72, "grad_norm": 0.6769924161484794, "learning_rate": 1.8861580197853423e-05, "loss": 1.2808, "step": 170 }, { "epoch": 0.74, "grad_norm": 0.7274733092656871, "learning_rate": 1.8745005043188104e-05, "loss": 1.3052, "step": 175 }, { "epoch": 0.76, "grad_norm": 0.7143035239809652, "learning_rate": 1.862314572520028e-05, "loss": 1.3003, "step": 180 }, { "epoch": 0.78, "grad_norm": 0.7146392109064191, "learning_rate": 1.8496075877299585e-05, "loss": 1.2748, "step": 185 }, { "epoch": 0.8, "grad_norm": 0.7061207940556439, "learning_rate": 1.8363872281354796e-05, "loss": 1.305, "step": 190 }, { "epoch": 0.82, "grad_norm": 0.6945437033981094, "learning_rate": 1.8226614821298444e-05, "loss": 1.3006, "step": 195 }, { "epoch": 0.84, "grad_norm": 0.6847371880670944, "learning_rate": 1.808438643485698e-05, "loss": 1.2705, "step": 200 }, { "epoch": 0.86, "grad_norm": 0.6784008869899278, "learning_rate": 1.793727306343574e-05, "loss": 1.2532, "step": 205 }, { "epoch": 0.89, "grad_norm": 0.7037870009631069, "learning_rate": 1.7785363600188894e-05, "loss": 1.2638, "step": 210 }, { "epoch": 0.91, "grad_norm": 0.6664930383710185, "learning_rate": 1.762874983630582e-05, "loss": 1.2586, "step": 215 }, { "epoch": 0.93, "grad_norm": 0.6773413303705628, "learning_rate": 1.7467526405546344e-05, "loss": 1.3138, "step": 220 }, { "epoch": 0.95, "grad_norm": 0.6935109696579653, "learning_rate": 1.7301790727058344e-05, "loss": 1.2752, "step": 225 }, { "epoch": 0.97, "grad_norm": 0.7212086786238647, "learning_rate": 1.7131642946512314e-05, "loss": 1.3091, "step": 230 }, { "epoch": 0.99, "grad_norm": 0.7924454572717287, "learning_rate": 1.6957185875588403e-05, "loss": 1.3015, "step": 235 }, { "epoch": 1.0, "eval_loss": 1.2710920572280884, "eval_runtime": 4.2577, "eval_samples_per_second": 346.195, "eval_steps_per_second": 5.637, "step": 237 }, { "epoch": 1.01, "grad_norm": 0.796377689560899, "learning_rate": 1.6778524929852513e-05, "loss": 1.2628, "step": 240 }, { "epoch": 1.03, "grad_norm": 0.7364886186224426, "learning_rate": 1.659576806505905e-05, "loss": 1.2284, "step": 245 }, { "epoch": 1.05, "grad_norm": 0.7049213011696083, "learning_rate": 1.640902571191869e-05, "loss": 1.2187, "step": 250 }, { "epoch": 1.08, "grad_norm": 1.9377903386730986, "learning_rate": 1.6218410709370735e-05, "loss": 1.2438, "step": 255 }, { "epoch": 1.1, "grad_norm": 0.7011828464498495, "learning_rate": 1.6024038236400246e-05, "loss": 1.1998, "step": 260 }, { "epoch": 1.12, "grad_norm": 2.5996069923788436, "learning_rate": 1.582602574244121e-05, "loss": 1.1919, "step": 265 }, { "epoch": 1.14, "grad_norm": 0.679979871549876, "learning_rate": 1.562449287640781e-05, "loss": 1.2456, "step": 270 }, { "epoch": 1.16, "grad_norm": 0.71927352282914, "learning_rate": 1.5419561414396657e-05, "loss": 1.2493, "step": 275 }, { "epoch": 1.18, "grad_norm": 0.7190913662714845, "learning_rate": 1.5211355186103655e-05, "loss": 1.2058, "step": 280 }, { "epoch": 1.2, "grad_norm": 0.7182017182110072, "learning_rate": 1.5000000000000002e-05, "loss": 1.2371, "step": 285 }, { "epoch": 1.22, "grad_norm": 0.705180903431495, "learning_rate": 1.4785623567312492e-05, "loss": 1.1976, "step": 290 }, { "epoch": 1.24, "grad_norm": 0.6883033004979775, "learning_rate": 1.4568355424854113e-05, "loss": 1.2768, "step": 295 }, { "epoch": 1.27, "grad_norm": 0.6815574246451964, "learning_rate": 1.4348326856751496e-05, "loss": 1.2334, "step": 300 }, { "epoch": 1.29, "grad_norm": 0.6804999351991194, "learning_rate": 1.412567081511659e-05, "loss": 1.2183, "step": 305 }, { "epoch": 1.31, "grad_norm": 0.7193921850993512, "learning_rate": 1.3900521839710428e-05, "loss": 1.2032, "step": 310 }, { "epoch": 1.33, "grad_norm": 0.6954927887572464, "learning_rate": 1.367301597664757e-05, "loss": 1.2289, "step": 315 }, { "epoch": 1.35, "grad_norm": 0.708546670689236, "learning_rate": 1.3443290696190335e-05, "loss": 1.229, "step": 320 }, { "epoch": 1.37, "grad_norm": 0.7124305795556933, "learning_rate": 1.3211484809682482e-05, "loss": 1.2434, "step": 325 }, { "epoch": 1.39, "grad_norm": 0.6866449286630997, "learning_rate": 1.2977738385672558e-05, "loss": 1.2587, "step": 330 }, { "epoch": 1.41, "grad_norm": 0.7158743341618767, "learning_rate": 1.2742192665277569e-05, "loss": 1.2522, "step": 335 }, { "epoch": 1.43, "grad_norm": 0.6741832926767023, "learning_rate": 1.250498997683813e-05, "loss": 1.2318, "step": 340 }, { "epoch": 1.46, "grad_norm": 0.7018162451875303, "learning_rate": 1.2266273649916669e-05, "loss": 1.2017, "step": 345 }, { "epoch": 1.48, "grad_norm": 0.6814770297513967, "learning_rate": 1.202618792869063e-05, "loss": 1.2043, "step": 350 }, { "epoch": 1.5, "grad_norm": 0.6877213251148805, "learning_rate": 1.178487788479303e-05, "loss": 1.216, "step": 355 }, { "epoch": 1.52, "grad_norm": 0.6722566603288663, "learning_rate": 1.1542489329653024e-05, "loss": 1.2367, "step": 360 }, { "epoch": 1.54, "grad_norm": 0.6835971864599367, "learning_rate": 1.129916872638945e-05, "loss": 1.2611, "step": 365 }, { "epoch": 1.56, "grad_norm": 0.7040578619734443, "learning_rate": 1.1055063101310581e-05, "loss": 1.2252, "step": 370 }, { "epoch": 1.58, "grad_norm": 0.679881563899439, "learning_rate": 1.08103199550736e-05, "loss": 1.2321, "step": 375 }, { "epoch": 1.6, "grad_norm": 0.6794312105046864, "learning_rate": 1.0565087173557396e-05, "loss": 1.2252, "step": 380 }, { "epoch": 1.62, "grad_norm": 0.6889618373928458, "learning_rate": 1.0319512938502654e-05, "loss": 1.2256, "step": 385 }, { "epoch": 1.65, "grad_norm": 0.6921906751981153, "learning_rate": 1.0073745637973125e-05, "loss": 1.2344, "step": 390 }, { "epoch": 1.67, "grad_norm": 0.6993543043577237, "learning_rate": 9.827933776692236e-06, "loss": 1.2391, "step": 395 }, { "epoch": 1.69, "grad_norm": 0.6959462039565184, "learning_rate": 9.582225886309217e-06, "loss": 1.2155, "step": 400 }, { "epoch": 1.71, "grad_norm": 0.6863551427574576, "learning_rate": 9.336770435648963e-06, "loss": 1.252, "step": 405 }, { "epoch": 1.73, "grad_norm": 0.7184772923029549, "learning_rate": 9.091715740999829e-06, "loss": 1.2295, "step": 410 }, { "epoch": 1.75, "grad_norm": 0.6949962185725445, "learning_rate": 8.84720987649363e-06, "loss": 1.2044, "step": 415 }, { "epoch": 1.77, "grad_norm": 0.6710588422516326, "learning_rate": 8.60340058463194e-06, "loss": 1.2407, "step": 420 }, { "epoch": 1.79, "grad_norm": 0.6482064194341762, "learning_rate": 8.360435187012789e-06, "loss": 1.2637, "step": 425 }, { "epoch": 1.81, "grad_norm": 0.6715553002740511, "learning_rate": 8.118460495311687e-06, "loss": 1.2795, "step": 430 }, { "epoch": 1.84, "grad_norm": 1.1764818802442565, "learning_rate": 7.877622722570772e-06, "loss": 1.2228, "step": 435 }, { "epoch": 1.86, "grad_norm": 0.6932612325260208, "learning_rate": 7.638067394849672e-06, "loss": 1.2272, "step": 440 }, { "epoch": 1.88, "grad_norm": 0.7017921905179951, "learning_rate": 7.3999392632914936e-06, "loss": 1.2476, "step": 445 }, { "epoch": 1.9, "grad_norm": 0.6556538474805375, "learning_rate": 7.163382216657033e-06, "loss": 1.2249, "step": 450 }, { "epoch": 1.92, "grad_norm": 0.6649783838358192, "learning_rate": 6.928539194380101e-06, "loss": 1.2166, "step": 455 }, { "epoch": 1.94, "grad_norm": 0.6582704106972542, "learning_rate": 6.6955521001964526e-06, "loss": 1.2149, "step": 460 }, { "epoch": 1.96, "grad_norm": 0.6842373684969378, "learning_rate": 6.464561716398565e-06, "loss": 1.2089, "step": 465 }, { "epoch": 1.98, "grad_norm": 0.6763842453630291, "learning_rate": 6.2357076187680325e-06, "loss": 1.2352, "step": 470 }, { "epoch": 2.0, "eval_loss": 1.2562438249588013, "eval_runtime": 4.0229, "eval_samples_per_second": 366.403, "eval_steps_per_second": 5.966, "step": 474 }, { "epoch": 2.0, "grad_norm": 0.703770585858162, "learning_rate": 6.009128092236983e-06, "loss": 1.2048, "step": 475 }, { "epoch": 2.03, "grad_norm": 0.6786892775253128, "learning_rate": 5.78496004732952e-06, "loss": 1.1518, "step": 480 }, { "epoch": 2.05, "grad_norm": 0.70326378053169, "learning_rate": 5.563338937433622e-06, "loss": 1.197, "step": 485 }, { "epoch": 2.07, "grad_norm": 0.6916875060651468, "learning_rate": 5.344398676953526e-06, "loss": 1.2111, "step": 490 }, { "epoch": 2.09, "grad_norm": 0.6475152204206446, "learning_rate": 5.128271560392037e-06, "loss": 1.1739, "step": 495 }, { "epoch": 2.11, "grad_norm": 0.6813942164053138, "learning_rate": 4.915088182411674e-06, "loss": 1.1905, "step": 500 }, { "epoch": 2.13, "grad_norm": 0.6486602612694303, "learning_rate": 4.7049773589229306e-06, "loss": 1.2063, "step": 505 }, { "epoch": 2.15, "grad_norm": 0.6731868552171755, "learning_rate": 4.498066049247344e-06, "loss": 1.2198, "step": 510 }, { "epoch": 2.17, "grad_norm": 0.685788252482629, "learning_rate": 4.29447927940242e-06, "loss": 1.1706, "step": 515 }, { "epoch": 2.19, "grad_norm": 0.6713780690925405, "learning_rate": 4.094340066554742e-06, "loss": 1.1887, "step": 520 }, { "epoch": 2.22, "grad_norm": 0.6669181028932704, "learning_rate": 3.897769344686929e-06, "loss": 1.1933, "step": 525 }, { "epoch": 2.24, "grad_norm": 0.6681179890022899, "learning_rate": 3.7048858915233665e-06, "loss": 1.2061, "step": 530 }, { "epoch": 2.26, "grad_norm": 0.6688074058150556, "learning_rate": 3.5158062567588468e-06, "loss": 1.1585, "step": 535 }, { "epoch": 2.28, "grad_norm": 0.6500738740113599, "learning_rate": 3.330644691633492e-06, "loss": 1.1793, "step": 540 }, { "epoch": 2.3, "grad_norm": 0.6546313173839559, "learning_rate": 3.149513079896521e-06, "loss": 1.2088, "step": 545 }, { "epoch": 2.32, "grad_norm": 0.6544093029161898, "learning_rate": 2.9725208702005736e-06, "loss": 1.1975, "step": 550 }, { "epoch": 2.34, "grad_norm": 0.6458002129205276, "learning_rate": 2.7997750099674282e-06, "loss": 1.1913, "step": 555 }, { "epoch": 2.36, "grad_norm": 0.6571017394804074, "learning_rate": 2.631379880765107e-06, "loss": 1.1876, "step": 560 }, { "epoch": 2.38, "grad_norm": 0.6563720392014831, "learning_rate": 2.467437235235378e-06, "loss": 1.1677, "step": 565 }, { "epoch": 2.41, "grad_norm": 0.6640825824608834, "learning_rate": 2.3080461356097938e-06, "loss": 1.1876, "step": 570 }, { "epoch": 2.43, "grad_norm": 0.6332904461492966, "learning_rate": 2.153302893851401e-06, "loss": 1.1553, "step": 575 }, { "epoch": 2.45, "grad_norm": 0.654023164181366, "learning_rate": 2.0033010134583085e-06, "loss": 1.1896, "step": 580 }, { "epoch": 2.47, "grad_norm": 0.6359350507881623, "learning_rate": 1.8581311329642592e-06, "loss": 1.1526, "step": 585 }, { "epoch": 2.49, "grad_norm": 0.6238559177863219, "learning_rate": 1.7178809711703525e-06, "loss": 1.1847, "step": 590 }, { "epoch": 2.51, "grad_norm": 0.6479836099852851, "learning_rate": 1.5826352741410333e-06, "loss": 1.1883, "step": 595 }, { "epoch": 2.53, "grad_norm": 0.6615350392909103, "learning_rate": 1.452475763996326e-06, "loss": 1.1562, "step": 600 }, { "epoch": 2.55, "grad_norm": 0.659005508964398, "learning_rate": 1.3274810895313083e-06, "loss": 1.1666, "step": 605 }, { "epoch": 2.57, "grad_norm": 0.6530124794164938, "learning_rate": 1.207726778692625e-06, "loss": 1.1694, "step": 610 }, { "epoch": 2.59, "grad_norm": 0.637671650897288, "learning_rate": 1.0932851929407828e-06, "loss": 1.21, "step": 615 }, { "epoch": 2.62, "grad_norm": 0.6426650375048046, "learning_rate": 9.842254835257792e-07, "loss": 1.1983, "step": 620 }, { "epoch": 2.64, "grad_norm": 0.6549427233533902, "learning_rate": 8.806135497025181e-07, "loss": 1.1796, "step": 625 }, { "epoch": 2.66, "grad_norm": 0.637483284013688, "learning_rate": 7.825119989112173e-07, "loss": 1.2032, "step": 630 }, { "epoch": 2.68, "grad_norm": 0.6458643762516288, "learning_rate": 6.899801089469205e-07, "loss": 1.1898, "step": 635 }, { "epoch": 2.7, "grad_norm": 0.6560437400254089, "learning_rate": 6.030737921409169e-07, "loss": 1.1815, "step": 640 }, { "epoch": 2.72, "grad_norm": 0.6346340189110924, "learning_rate": 5.218455615757601e-07, "loss": 1.1829, "step": 645 }, { "epoch": 2.74, "grad_norm": 0.6408145596532006, "learning_rate": 4.4634449935427203e-07, "loss": 1.1653, "step": 650 }, { "epoch": 2.76, "grad_norm": 0.6553954626425612, "learning_rate": 3.7661622694171393e-07, "loss": 1.2129, "step": 655 }, { "epoch": 2.78, "grad_norm": 0.6949305164155417, "learning_rate": 3.127028775990515e-07, "loss": 1.1822, "step": 660 }, { "epoch": 2.81, "grad_norm": 0.6657650684680331, "learning_rate": 2.546430709239578e-07, "loss": 1.1818, "step": 665 }, { "epoch": 2.83, "grad_norm": 0.6438354912565324, "learning_rate": 2.02471889514948e-07, "loss": 1.1696, "step": 670 }, { "epoch": 2.85, "grad_norm": 0.6330124825001247, "learning_rate": 1.562208577727442e-07, "loss": 1.1443, "step": 675 }, { "epoch": 2.87, "grad_norm": 0.6389378157102353, "learning_rate": 1.1591792285167603e-07, "loss": 1.1657, "step": 680 }, { "epoch": 2.89, "grad_norm": 0.6353907126618263, "learning_rate": 8.158743777263334e-08, "loss": 1.183, "step": 685 }, { "epoch": 2.91, "grad_norm": 0.644204938538006, "learning_rate": 5.325014670776951e-08, "loss": 1.1641, "step": 690 }, { "epoch": 2.93, "grad_norm": 0.6585816956828645, "learning_rate": 3.092317244584919e-08, "loss": 1.2134, "step": 695 }, { "epoch": 2.95, "grad_norm": 0.6523630752795228, "learning_rate": 1.4620006045816814e-08, "loss": 1.2174, "step": 700 }, { "epoch": 2.97, "grad_norm": 0.659207596228058, "learning_rate": 4.3504986848297295e-09, "loss": 1.1732, "step": 705 }, { "epoch": 3.0, "grad_norm": 0.6428302138785816, "learning_rate": 1.2085570569642102e-10, "loss": 1.1719, "step": 710 }, { "epoch": 3.0, "eval_loss": 1.2565690279006958, "eval_runtime": 4.0115, "eval_samples_per_second": 367.448, "eval_steps_per_second": 5.983, "step": 711 }, { "epoch": 3.0, "step": 711, "total_flos": 138562624290816.0, "train_loss": 1.242754081298195, "train_runtime": 610.9223, "train_samples_per_second": 148.462, "train_steps_per_second": 1.164 } ], "logging_steps": 5, "max_steps": 711, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 138562624290816.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }