{ "best_metric": 0.7993046045303345, "best_model_checkpoint": "./colab20240326ryan/checkpoint-2100", "epoch": 1.160092807424594, "eval_steps": 100, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.9214508533477783, "learning_rate": 0.0001997679814385151, "loss": 1.7231, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.908971905708313, "learning_rate": 0.00019953596287703018, "loss": 1.5448, "step": 20 }, { "epoch": 0.01, "grad_norm": 1.5993173122406006, "learning_rate": 0.00019930394431554523, "loss": 1.3519, "step": 30 }, { "epoch": 0.02, "grad_norm": 4.219737529754639, "learning_rate": 0.00019907192575406032, "loss": 1.4004, "step": 40 }, { "epoch": 0.02, "grad_norm": 2.771422863006592, "learning_rate": 0.00019883990719257543, "loss": 1.2249, "step": 50 }, { "epoch": 0.03, "grad_norm": 2.5087692737579346, "learning_rate": 0.0001986078886310905, "loss": 1.3035, "step": 60 }, { "epoch": 0.03, "grad_norm": 2.224257707595825, "learning_rate": 0.0001983758700696056, "loss": 1.2159, "step": 70 }, { "epoch": 0.04, "grad_norm": 2.9843618869781494, "learning_rate": 0.00019814385150812065, "loss": 1.0327, "step": 80 }, { "epoch": 0.04, "grad_norm": 1.905066967010498, "learning_rate": 0.00019791183294663573, "loss": 1.0393, "step": 90 }, { "epoch": 0.05, "grad_norm": 4.216238021850586, "learning_rate": 0.00019767981438515082, "loss": 1.1654, "step": 100 }, { "epoch": 0.05, "eval_accuracy": 0.5813018346318171, "eval_loss": 1.081552505493164, "eval_runtime": 142.0814, "eval_samples_per_second": 28.005, "eval_steps_per_second": 3.505, "step": 100 }, { "epoch": 0.05, "grad_norm": 2.5341720581054688, "learning_rate": 0.0001974477958236659, "loss": 1.162, "step": 110 }, { "epoch": 0.06, "grad_norm": 3.2532262802124023, "learning_rate": 0.00019721577726218098, "loss": 1.04, "step": 120 }, { "epoch": 0.06, "grad_norm": 2.41017746925354, "learning_rate": 0.00019698375870069607, "loss": 1.0566, "step": 130 }, { "epoch": 0.06, "grad_norm": 1.7879230976104736, "learning_rate": 0.00019675174013921115, "loss": 1.0105, "step": 140 }, { "epoch": 0.07, "grad_norm": 3.4111428260803223, "learning_rate": 0.00019651972157772623, "loss": 1.0504, "step": 150 }, { "epoch": 0.07, "grad_norm": 3.4261136054992676, "learning_rate": 0.00019628770301624132, "loss": 1.0128, "step": 160 }, { "epoch": 0.08, "grad_norm": 4.207861423492432, "learning_rate": 0.0001960556844547564, "loss": 0.9814, "step": 170 }, { "epoch": 0.08, "grad_norm": 3.604964256286621, "learning_rate": 0.00019582366589327148, "loss": 1.17, "step": 180 }, { "epoch": 0.09, "grad_norm": 2.710505485534668, "learning_rate": 0.00019559164733178654, "loss": 1.0255, "step": 190 }, { "epoch": 0.09, "grad_norm": 4.121289253234863, "learning_rate": 0.00019535962877030162, "loss": 1.1321, "step": 200 }, { "epoch": 0.09, "eval_accuracy": 0.5996481528022116, "eval_loss": 0.9905579686164856, "eval_runtime": 135.9688, "eval_samples_per_second": 29.264, "eval_steps_per_second": 3.663, "step": 200 }, { "epoch": 0.1, "grad_norm": 2.4283132553100586, "learning_rate": 0.0001951276102088167, "loss": 1.0087, "step": 210 }, { "epoch": 0.1, "grad_norm": 2.141366958618164, "learning_rate": 0.0001948955916473318, "loss": 0.8722, "step": 220 }, { "epoch": 0.11, "grad_norm": 3.6067655086517334, "learning_rate": 0.00019466357308584687, "loss": 1.0173, "step": 230 }, { "epoch": 0.11, "grad_norm": 2.5523955821990967, "learning_rate": 0.00019445475638051046, "loss": 0.9303, "step": 240 }, { "epoch": 0.12, "grad_norm": 2.983736753463745, "learning_rate": 0.00019422273781902555, "loss": 0.9035, "step": 250 }, { "epoch": 0.12, "grad_norm": 3.1925017833709717, "learning_rate": 0.00019399071925754063, "loss": 0.9329, "step": 260 }, { "epoch": 0.13, "grad_norm": 4.603178977966309, "learning_rate": 0.00019375870069605569, "loss": 0.9351, "step": 270 }, { "epoch": 0.13, "grad_norm": 3.129456043243408, "learning_rate": 0.00019352668213457077, "loss": 1.0367, "step": 280 }, { "epoch": 0.13, "grad_norm": 3.650508403778076, "learning_rate": 0.00019329466357308585, "loss": 0.9677, "step": 290 }, { "epoch": 0.14, "grad_norm": 2.0717406272888184, "learning_rate": 0.00019306264501160094, "loss": 0.9389, "step": 300 }, { "epoch": 0.14, "eval_accuracy": 0.625031414928374, "eval_loss": 0.9222464561462402, "eval_runtime": 133.436, "eval_samples_per_second": 29.82, "eval_steps_per_second": 3.732, "step": 300 }, { "epoch": 0.14, "grad_norm": 3.079808473587036, "learning_rate": 0.00019283062645011602, "loss": 0.9204, "step": 310 }, { "epoch": 0.15, "grad_norm": 3.8033320903778076, "learning_rate": 0.0001925986078886311, "loss": 0.9505, "step": 320 }, { "epoch": 0.15, "grad_norm": 3.029008150100708, "learning_rate": 0.0001923665893271462, "loss": 0.9715, "step": 330 }, { "epoch": 0.16, "grad_norm": 2.543546199798584, "learning_rate": 0.00019213457076566127, "loss": 0.9928, "step": 340 }, { "epoch": 0.16, "grad_norm": 1.7682580947875977, "learning_rate": 0.00019190255220417635, "loss": 1.0367, "step": 350 }, { "epoch": 0.17, "grad_norm": 2.006638526916504, "learning_rate": 0.00019167053364269144, "loss": 0.932, "step": 360 }, { "epoch": 0.17, "grad_norm": 2.4894397258758545, "learning_rate": 0.00019143851508120652, "loss": 0.8321, "step": 370 }, { "epoch": 0.18, "grad_norm": 3.1492834091186523, "learning_rate": 0.00019120649651972158, "loss": 1.0013, "step": 380 }, { "epoch": 0.18, "grad_norm": 4.911842346191406, "learning_rate": 0.00019097447795823666, "loss": 0.987, "step": 390 }, { "epoch": 0.19, "grad_norm": 2.6950294971466064, "learning_rate": 0.00019074245939675174, "loss": 0.816, "step": 400 }, { "epoch": 0.19, "eval_accuracy": 0.5740135712490575, "eval_loss": 1.0586621761322021, "eval_runtime": 131.2597, "eval_samples_per_second": 30.314, "eval_steps_per_second": 3.794, "step": 400 }, { "epoch": 0.19, "grad_norm": 2.0810515880584717, "learning_rate": 0.00019051044083526683, "loss": 1.0748, "step": 410 }, { "epoch": 0.19, "grad_norm": 3.0597758293151855, "learning_rate": 0.0001902784222737819, "loss": 0.851, "step": 420 }, { "epoch": 0.2, "grad_norm": 2.7524383068084717, "learning_rate": 0.000190046403712297, "loss": 0.9123, "step": 430 }, { "epoch": 0.2, "grad_norm": 4.255000591278076, "learning_rate": 0.00018981438515081208, "loss": 1.0082, "step": 440 }, { "epoch": 0.21, "grad_norm": 2.834663152694702, "learning_rate": 0.00018958236658932716, "loss": 1.0721, "step": 450 }, { "epoch": 0.21, "grad_norm": 2.59566593170166, "learning_rate": 0.00018935034802784224, "loss": 0.9448, "step": 460 }, { "epoch": 0.22, "grad_norm": 2.0671868324279785, "learning_rate": 0.00018911832946635733, "loss": 0.823, "step": 470 }, { "epoch": 0.22, "grad_norm": 2.022857189178467, "learning_rate": 0.00018888631090487238, "loss": 0.9282, "step": 480 }, { "epoch": 0.23, "grad_norm": 1.6197035312652588, "learning_rate": 0.00018865429234338747, "loss": 0.8492, "step": 490 }, { "epoch": 0.23, "grad_norm": 2.2592787742614746, "learning_rate": 0.00018842227378190255, "loss": 0.7273, "step": 500 }, { "epoch": 0.23, "eval_accuracy": 0.6267906509173159, "eval_loss": 0.918483555316925, "eval_runtime": 131.2522, "eval_samples_per_second": 30.316, "eval_steps_per_second": 3.794, "step": 500 }, { "epoch": 0.24, "grad_norm": 3.200505256652832, "learning_rate": 0.00018819025522041763, "loss": 0.9205, "step": 510 }, { "epoch": 0.24, "grad_norm": 2.9970719814300537, "learning_rate": 0.00018795823665893272, "loss": 0.8762, "step": 520 }, { "epoch": 0.25, "grad_norm": 1.8891489505767822, "learning_rate": 0.00018772621809744783, "loss": 0.9051, "step": 530 }, { "epoch": 0.25, "grad_norm": 2.4764907360076904, "learning_rate": 0.00018749419953596288, "loss": 0.9494, "step": 540 }, { "epoch": 0.26, "grad_norm": 2.9991800785064697, "learning_rate": 0.00018726218097447797, "loss": 0.9639, "step": 550 }, { "epoch": 0.26, "grad_norm": 2.954806327819824, "learning_rate": 0.00018703016241299305, "loss": 0.9069, "step": 560 }, { "epoch": 0.26, "grad_norm": 3.1720399856567383, "learning_rate": 0.00018679814385150813, "loss": 0.84, "step": 570 }, { "epoch": 0.27, "grad_norm": 1.5639662742614746, "learning_rate": 0.00018656612529002322, "loss": 0.8589, "step": 580 }, { "epoch": 0.27, "grad_norm": 3.4077460765838623, "learning_rate": 0.00018633410672853827, "loss": 0.8529, "step": 590 }, { "epoch": 0.28, "grad_norm": 3.089357852935791, "learning_rate": 0.00018610208816705336, "loss": 0.8282, "step": 600 }, { "epoch": 0.28, "eval_accuracy": 0.6293038451872329, "eval_loss": 0.9175940155982971, "eval_runtime": 132.213, "eval_samples_per_second": 30.095, "eval_steps_per_second": 3.767, "step": 600 }, { "epoch": 0.28, "grad_norm": 2.7197611331939697, "learning_rate": 0.00018587006960556844, "loss": 0.8377, "step": 610 }, { "epoch": 0.29, "grad_norm": 2.838669538497925, "learning_rate": 0.00018563805104408355, "loss": 0.88, "step": 620 }, { "epoch": 0.29, "grad_norm": 4.2069902420043945, "learning_rate": 0.00018540603248259864, "loss": 0.8175, "step": 630 }, { "epoch": 0.3, "grad_norm": 3.0792770385742188, "learning_rate": 0.0001851740139211137, "loss": 0.8921, "step": 640 }, { "epoch": 0.3, "grad_norm": 3.4577174186706543, "learning_rate": 0.00018494199535962877, "loss": 0.8612, "step": 650 }, { "epoch": 0.31, "grad_norm": 3.424455165863037, "learning_rate": 0.00018470997679814386, "loss": 0.9775, "step": 660 }, { "epoch": 0.31, "grad_norm": 2.300741672515869, "learning_rate": 0.00018447795823665894, "loss": 1.0909, "step": 670 }, { "epoch": 0.32, "grad_norm": 1.8668731451034546, "learning_rate": 0.00018424593967517403, "loss": 0.8205, "step": 680 }, { "epoch": 0.32, "grad_norm": 3.170844793319702, "learning_rate": 0.0001840139211136891, "loss": 0.8701, "step": 690 }, { "epoch": 0.32, "grad_norm": 2.8682425022125244, "learning_rate": 0.00018378190255220417, "loss": 0.8, "step": 700 }, { "epoch": 0.32, "eval_accuracy": 0.6272932897712993, "eval_loss": 0.9006840586662292, "eval_runtime": 129.3938, "eval_samples_per_second": 30.751, "eval_steps_per_second": 3.849, "step": 700 }, { "epoch": 0.33, "grad_norm": 3.769883871078491, "learning_rate": 0.00018354988399071928, "loss": 0.9765, "step": 710 }, { "epoch": 0.33, "grad_norm": 3.9122543334960938, "learning_rate": 0.00018331786542923436, "loss": 0.9744, "step": 720 }, { "epoch": 0.34, "grad_norm": 3.644559860229492, "learning_rate": 0.00018308584686774944, "loss": 0.8877, "step": 730 }, { "epoch": 0.34, "grad_norm": 3.476562976837158, "learning_rate": 0.00018285382830626453, "loss": 0.867, "step": 740 }, { "epoch": 0.35, "grad_norm": 3.0982306003570557, "learning_rate": 0.00018262180974477958, "loss": 0.9965, "step": 750 }, { "epoch": 0.35, "grad_norm": 2.395843505859375, "learning_rate": 0.00018238979118329467, "loss": 0.8635, "step": 760 }, { "epoch": 0.36, "grad_norm": 3.6630685329437256, "learning_rate": 0.00018215777262180975, "loss": 0.8562, "step": 770 }, { "epoch": 0.36, "grad_norm": 1.064682960510254, "learning_rate": 0.00018192575406032483, "loss": 0.7875, "step": 780 }, { "epoch": 0.37, "grad_norm": 3.3986759185791016, "learning_rate": 0.00018169373549883992, "loss": 0.8583, "step": 790 }, { "epoch": 0.37, "grad_norm": 3.0697574615478516, "learning_rate": 0.000181461716937355, "loss": 0.8777, "step": 800 }, { "epoch": 0.37, "eval_accuracy": 0.6202563458155316, "eval_loss": 0.9337747693061829, "eval_runtime": 131.2508, "eval_samples_per_second": 30.316, "eval_steps_per_second": 3.794, "step": 800 }, { "epoch": 0.38, "grad_norm": 1.8958779573440552, "learning_rate": 0.00018122969837587008, "loss": 0.9388, "step": 810 }, { "epoch": 0.38, "grad_norm": 1.3388631343841553, "learning_rate": 0.00018099767981438517, "loss": 0.8248, "step": 820 }, { "epoch": 0.39, "grad_norm": 4.1161298751831055, "learning_rate": 0.00018076566125290025, "loss": 0.8008, "step": 830 }, { "epoch": 0.39, "grad_norm": 1.5049020051956177, "learning_rate": 0.00018053364269141533, "loss": 0.8932, "step": 840 }, { "epoch": 0.39, "grad_norm": 2.5027551651000977, "learning_rate": 0.00018030162412993042, "loss": 0.7656, "step": 850 }, { "epoch": 0.4, "grad_norm": 2.0704867839813232, "learning_rate": 0.00018006960556844547, "loss": 0.8206, "step": 860 }, { "epoch": 0.4, "grad_norm": 3.6864800453186035, "learning_rate": 0.00017983758700696056, "loss": 0.8875, "step": 870 }, { "epoch": 0.41, "grad_norm": 3.732292890548706, "learning_rate": 0.00017960556844547564, "loss": 0.9411, "step": 880 }, { "epoch": 0.41, "grad_norm": 3.7439608573913574, "learning_rate": 0.00017937354988399072, "loss": 0.904, "step": 890 }, { "epoch": 0.42, "grad_norm": 2.159684181213379, "learning_rate": 0.0001791415313225058, "loss": 0.7142, "step": 900 }, { "epoch": 0.42, "eval_accuracy": 0.614727318421714, "eval_loss": 0.9442586302757263, "eval_runtime": 130.2393, "eval_samples_per_second": 30.551, "eval_steps_per_second": 3.824, "step": 900 }, { "epoch": 0.42, "grad_norm": 2.202846050262451, "learning_rate": 0.0001789095127610209, "loss": 0.8798, "step": 910 }, { "epoch": 0.43, "grad_norm": 2.4513931274414062, "learning_rate": 0.00017867749419953597, "loss": 0.8558, "step": 920 }, { "epoch": 0.43, "grad_norm": 2.3939168453216553, "learning_rate": 0.00017844547563805106, "loss": 0.917, "step": 930 }, { "epoch": 0.44, "grad_norm": 2.8509578704833984, "learning_rate": 0.00017821345707656614, "loss": 0.8373, "step": 940 }, { "epoch": 0.44, "grad_norm": 1.6446682214736938, "learning_rate": 0.00017798143851508122, "loss": 0.7641, "step": 950 }, { "epoch": 0.45, "grad_norm": 2.431823968887329, "learning_rate": 0.00017774941995359628, "loss": 0.8709, "step": 960 }, { "epoch": 0.45, "grad_norm": 2.1793901920318604, "learning_rate": 0.00017751740139211136, "loss": 0.7838, "step": 970 }, { "epoch": 0.45, "grad_norm": 2.03481125831604, "learning_rate": 0.00017728538283062645, "loss": 0.844, "step": 980 }, { "epoch": 0.46, "grad_norm": 2.9181933403015137, "learning_rate": 0.00017705336426914153, "loss": 0.9402, "step": 990 }, { "epoch": 0.46, "grad_norm": 2.2899041175842285, "learning_rate": 0.00017682134570765661, "loss": 0.8452, "step": 1000 }, { "epoch": 0.46, "eval_accuracy": 0.6282985674792662, "eval_loss": 0.8846696615219116, "eval_runtime": 130.7794, "eval_samples_per_second": 30.425, "eval_steps_per_second": 3.808, "step": 1000 }, { "epoch": 0.47, "grad_norm": 3.7156896591186523, "learning_rate": 0.00017658932714617172, "loss": 0.8338, "step": 1010 }, { "epoch": 0.47, "grad_norm": 1.9189355373382568, "learning_rate": 0.00017635730858468678, "loss": 0.9293, "step": 1020 }, { "epoch": 0.48, "grad_norm": 3.5769336223602295, "learning_rate": 0.00017612529002320186, "loss": 0.9528, "step": 1030 }, { "epoch": 0.48, "grad_norm": 3.103059768676758, "learning_rate": 0.00017589327146171695, "loss": 0.8122, "step": 1040 }, { "epoch": 0.49, "grad_norm": 1.972256064414978, "learning_rate": 0.00017566125290023203, "loss": 0.9213, "step": 1050 }, { "epoch": 0.49, "grad_norm": 2.265113592147827, "learning_rate": 0.00017542923433874711, "loss": 0.9075, "step": 1060 }, { "epoch": 0.5, "grad_norm": 2.6354522705078125, "learning_rate": 0.00017519721577726217, "loss": 0.8548, "step": 1070 }, { "epoch": 0.5, "grad_norm": 4.182709217071533, "learning_rate": 0.00017496519721577725, "loss": 0.7877, "step": 1080 }, { "epoch": 0.51, "grad_norm": 2.5550811290740967, "learning_rate": 0.00017473317865429236, "loss": 0.9916, "step": 1090 }, { "epoch": 0.51, "grad_norm": 2.5702245235443115, "learning_rate": 0.00017450116009280745, "loss": 0.845, "step": 1100 }, { "epoch": 0.51, "eval_accuracy": 0.6622266901231465, "eval_loss": 0.8412047624588013, "eval_runtime": 129.4336, "eval_samples_per_second": 30.742, "eval_steps_per_second": 3.848, "step": 1100 }, { "epoch": 0.52, "grad_norm": 3.416830539703369, "learning_rate": 0.00017426914153132253, "loss": 0.6856, "step": 1110 }, { "epoch": 0.52, "grad_norm": 1.638490915298462, "learning_rate": 0.0001740371229698376, "loss": 0.7501, "step": 1120 }, { "epoch": 0.52, "grad_norm": 4.172976016998291, "learning_rate": 0.00017380510440835267, "loss": 0.8201, "step": 1130 }, { "epoch": 0.53, "grad_norm": 2.498607873916626, "learning_rate": 0.00017357308584686775, "loss": 0.7571, "step": 1140 }, { "epoch": 0.53, "grad_norm": 5.480504035949707, "learning_rate": 0.00017334106728538284, "loss": 0.7706, "step": 1150 }, { "epoch": 0.54, "grad_norm": 3.2535948753356934, "learning_rate": 0.00017310904872389792, "loss": 0.8646, "step": 1160 }, { "epoch": 0.54, "grad_norm": 4.1205878257751465, "learning_rate": 0.000172877030162413, "loss": 0.9275, "step": 1170 }, { "epoch": 0.55, "grad_norm": 3.1862285137176514, "learning_rate": 0.0001726450116009281, "loss": 0.7335, "step": 1180 }, { "epoch": 0.55, "grad_norm": 2.7202231884002686, "learning_rate": 0.00017241299303944317, "loss": 0.7428, "step": 1190 }, { "epoch": 0.56, "grad_norm": 2.3965518474578857, "learning_rate": 0.00017218097447795826, "loss": 0.9167, "step": 1200 }, { "epoch": 0.56, "eval_accuracy": 0.6526765518974617, "eval_loss": 0.87410569190979, "eval_runtime": 130.959, "eval_samples_per_second": 30.384, "eval_steps_per_second": 3.803, "step": 1200 }, { "epoch": 0.56, "grad_norm": 3.0782244205474854, "learning_rate": 0.00017194895591647334, "loss": 0.8603, "step": 1210 }, { "epoch": 0.57, "grad_norm": 2.4333736896514893, "learning_rate": 0.00017171693735498842, "loss": 0.779, "step": 1220 }, { "epoch": 0.57, "grad_norm": 3.9308993816375732, "learning_rate": 0.00017148491879350348, "loss": 0.7695, "step": 1230 }, { "epoch": 0.58, "grad_norm": 2.4168589115142822, "learning_rate": 0.00017125290023201856, "loss": 0.8655, "step": 1240 }, { "epoch": 0.58, "grad_norm": 3.680983304977417, "learning_rate": 0.00017102088167053365, "loss": 0.7862, "step": 1250 }, { "epoch": 0.58, "grad_norm": 3.8315815925598145, "learning_rate": 0.00017078886310904873, "loss": 0.7972, "step": 1260 }, { "epoch": 0.59, "grad_norm": 1.910196304321289, "learning_rate": 0.0001705568445475638, "loss": 0.7454, "step": 1270 }, { "epoch": 0.59, "grad_norm": 1.9004689455032349, "learning_rate": 0.0001703248259860789, "loss": 0.7709, "step": 1280 }, { "epoch": 0.6, "grad_norm": 3.2291324138641357, "learning_rate": 0.00017009280742459398, "loss": 0.7085, "step": 1290 }, { "epoch": 0.6, "grad_norm": 2.6493847370147705, "learning_rate": 0.00016986078886310906, "loss": 0.8226, "step": 1300 }, { "epoch": 0.6, "eval_accuracy": 0.6659964815280222, "eval_loss": 0.8283097743988037, "eval_runtime": 130.9921, "eval_samples_per_second": 30.376, "eval_steps_per_second": 3.802, "step": 1300 }, { "epoch": 0.61, "grad_norm": 4.123025417327881, "learning_rate": 0.00016962877030162415, "loss": 0.9109, "step": 1310 }, { "epoch": 0.61, "grad_norm": 3.537853479385376, "learning_rate": 0.00016939675174013923, "loss": 0.9124, "step": 1320 }, { "epoch": 0.62, "grad_norm": 2.515120506286621, "learning_rate": 0.0001691647331786543, "loss": 0.8357, "step": 1330 }, { "epoch": 0.62, "grad_norm": 1.7295467853546143, "learning_rate": 0.00016893271461716937, "loss": 0.7425, "step": 1340 }, { "epoch": 0.63, "grad_norm": 2.3161306381225586, "learning_rate": 0.00016870069605568445, "loss": 0.7518, "step": 1350 }, { "epoch": 0.63, "grad_norm": 2.593114137649536, "learning_rate": 0.00016846867749419954, "loss": 0.816, "step": 1360 }, { "epoch": 0.64, "grad_norm": 2.4234368801116943, "learning_rate": 0.00016823665893271462, "loss": 0.8256, "step": 1370 }, { "epoch": 0.64, "grad_norm": 2.0647542476654053, "learning_rate": 0.0001680046403712297, "loss": 0.9176, "step": 1380 }, { "epoch": 0.65, "grad_norm": 1.5590307712554932, "learning_rate": 0.00016777262180974479, "loss": 0.7476, "step": 1390 }, { "epoch": 0.65, "grad_norm": 2.5730812549591064, "learning_rate": 0.00016754060324825987, "loss": 0.7738, "step": 1400 }, { "epoch": 0.65, "eval_accuracy": 0.6401105805478764, "eval_loss": 0.8641374111175537, "eval_runtime": 131.4185, "eval_samples_per_second": 30.277, "eval_steps_per_second": 3.789, "step": 1400 }, { "epoch": 0.65, "grad_norm": 3.080822467803955, "learning_rate": 0.00016730858468677495, "loss": 0.8131, "step": 1410 }, { "epoch": 0.66, "grad_norm": 3.1145131587982178, "learning_rate": 0.00016707656612529004, "loss": 0.8048, "step": 1420 }, { "epoch": 0.66, "grad_norm": 2.4306788444519043, "learning_rate": 0.00016684454756380512, "loss": 0.7241, "step": 1430 }, { "epoch": 0.67, "grad_norm": 3.0480475425720215, "learning_rate": 0.00016661252900232018, "loss": 0.6803, "step": 1440 }, { "epoch": 0.67, "grad_norm": 2.5454821586608887, "learning_rate": 0.00016638051044083526, "loss": 0.773, "step": 1450 }, { "epoch": 0.68, "grad_norm": 2.2483272552490234, "learning_rate": 0.00016614849187935034, "loss": 0.8333, "step": 1460 }, { "epoch": 0.68, "grad_norm": 1.9373365640640259, "learning_rate": 0.00016591647331786543, "loss": 0.7289, "step": 1470 }, { "epoch": 0.69, "grad_norm": 2.8379623889923096, "learning_rate": 0.00016568445475638054, "loss": 0.7733, "step": 1480 }, { "epoch": 0.69, "grad_norm": 2.349510431289673, "learning_rate": 0.00016545243619489562, "loss": 0.8449, "step": 1490 }, { "epoch": 0.7, "grad_norm": 4.029337406158447, "learning_rate": 0.00016522041763341068, "loss": 0.8427, "step": 1500 }, { "epoch": 0.7, "eval_accuracy": 0.6725307866298065, "eval_loss": 0.803027331829071, "eval_runtime": 131.7174, "eval_samples_per_second": 30.209, "eval_steps_per_second": 3.781, "step": 1500 }, { "epoch": 0.7, "grad_norm": 2.8301985263824463, "learning_rate": 0.00016498839907192576, "loss": 0.7437, "step": 1510 }, { "epoch": 0.71, "grad_norm": 2.7581472396850586, "learning_rate": 0.00016475638051044084, "loss": 0.7888, "step": 1520 }, { "epoch": 0.71, "grad_norm": 2.044255256652832, "learning_rate": 0.00016452436194895593, "loss": 0.7, "step": 1530 }, { "epoch": 0.71, "grad_norm": 4.427280426025391, "learning_rate": 0.000164292343387471, "loss": 0.8854, "step": 1540 }, { "epoch": 0.72, "grad_norm": 3.044015884399414, "learning_rate": 0.00016406032482598607, "loss": 0.677, "step": 1550 }, { "epoch": 0.72, "grad_norm": 2.954887628555298, "learning_rate": 0.00016382830626450115, "loss": 0.7198, "step": 1560 }, { "epoch": 0.73, "grad_norm": 2.2452878952026367, "learning_rate": 0.00016359628770301626, "loss": 0.7495, "step": 1570 }, { "epoch": 0.73, "grad_norm": 2.0875964164733887, "learning_rate": 0.00016336426914153134, "loss": 0.8106, "step": 1580 }, { "epoch": 0.74, "grad_norm": 1.8363144397735596, "learning_rate": 0.00016313225058004643, "loss": 0.6737, "step": 1590 }, { "epoch": 0.74, "grad_norm": 3.34063982963562, "learning_rate": 0.00016290023201856148, "loss": 0.6783, "step": 1600 }, { "epoch": 0.74, "eval_accuracy": 0.6564463433023373, "eval_loss": 0.8367487192153931, "eval_runtime": 129.8582, "eval_samples_per_second": 30.641, "eval_steps_per_second": 3.835, "step": 1600 }, { "epoch": 0.75, "grad_norm": 4.197628974914551, "learning_rate": 0.00016266821345707657, "loss": 0.7794, "step": 1610 }, { "epoch": 0.75, "grad_norm": 2.9976580142974854, "learning_rate": 0.00016243619489559165, "loss": 0.832, "step": 1620 }, { "epoch": 0.76, "grad_norm": 2.8508596420288086, "learning_rate": 0.00016220417633410673, "loss": 0.86, "step": 1630 }, { "epoch": 0.76, "grad_norm": 2.7021024227142334, "learning_rate": 0.00016197215777262182, "loss": 0.7531, "step": 1640 }, { "epoch": 0.77, "grad_norm": 2.3222107887268066, "learning_rate": 0.0001617401392111369, "loss": 0.7338, "step": 1650 }, { "epoch": 0.77, "grad_norm": 2.1219635009765625, "learning_rate": 0.00016150812064965198, "loss": 0.8965, "step": 1660 }, { "epoch": 0.77, "grad_norm": 11.041630744934082, "learning_rate": 0.00016127610208816707, "loss": 0.7939, "step": 1670 }, { "epoch": 0.78, "grad_norm": 2.4006307125091553, "learning_rate": 0.00016104408352668215, "loss": 0.736, "step": 1680 }, { "epoch": 0.78, "grad_norm": 2.535405158996582, "learning_rate": 0.00016081206496519723, "loss": 0.7982, "step": 1690 }, { "epoch": 0.79, "grad_norm": 2.8077518939971924, "learning_rate": 0.00016058004640371232, "loss": 0.7856, "step": 1700 }, { "epoch": 0.79, "eval_accuracy": 0.6051771801960292, "eval_loss": 0.9696215391159058, "eval_runtime": 130.9284, "eval_samples_per_second": 30.391, "eval_steps_per_second": 3.804, "step": 1700 }, { "epoch": 0.79, "grad_norm": 2.17937970161438, "learning_rate": 0.00016034802784222737, "loss": 0.8074, "step": 1710 }, { "epoch": 0.8, "grad_norm": 3.2899444103240967, "learning_rate": 0.00016011600928074246, "loss": 0.8416, "step": 1720 }, { "epoch": 0.8, "grad_norm": 3.247441530227661, "learning_rate": 0.00015988399071925754, "loss": 0.8302, "step": 1730 }, { "epoch": 0.81, "grad_norm": 2.508978843688965, "learning_rate": 0.00015965197215777262, "loss": 0.7192, "step": 1740 }, { "epoch": 0.81, "grad_norm": 3.634054183959961, "learning_rate": 0.0001594199535962877, "loss": 0.7919, "step": 1750 }, { "epoch": 0.82, "grad_norm": 2.7715981006622314, "learning_rate": 0.0001591879350348028, "loss": 0.703, "step": 1760 }, { "epoch": 0.82, "grad_norm": 1.9510867595672607, "learning_rate": 0.00015895591647331787, "loss": 0.7246, "step": 1770 }, { "epoch": 0.83, "grad_norm": 2.5826807022094727, "learning_rate": 0.00015872389791183296, "loss": 0.8291, "step": 1780 }, { "epoch": 0.83, "grad_norm": 2.8682587146759033, "learning_rate": 0.00015849187935034804, "loss": 0.7284, "step": 1790 }, { "epoch": 0.84, "grad_norm": 2.7725648880004883, "learning_rate": 0.00015825986078886313, "loss": 0.7356, "step": 1800 }, { "epoch": 0.84, "eval_accuracy": 0.6516712741894949, "eval_loss": 0.857125461101532, "eval_runtime": 130.6056, "eval_samples_per_second": 30.466, "eval_steps_per_second": 3.813, "step": 1800 }, { "epoch": 0.84, "grad_norm": 1.756324291229248, "learning_rate": 0.0001580278422273782, "loss": 0.8463, "step": 1810 }, { "epoch": 0.84, "grad_norm": 1.613060712814331, "learning_rate": 0.00015779582366589326, "loss": 0.7517, "step": 1820 }, { "epoch": 0.85, "grad_norm": 3.3475732803344727, "learning_rate": 0.00015756380510440835, "loss": 0.835, "step": 1830 }, { "epoch": 0.85, "grad_norm": 2.118978977203369, "learning_rate": 0.00015733178654292343, "loss": 0.8143, "step": 1840 }, { "epoch": 0.86, "grad_norm": 2.3323171138763428, "learning_rate": 0.00015709976798143852, "loss": 0.8139, "step": 1850 }, { "epoch": 0.86, "grad_norm": 2.580026865005493, "learning_rate": 0.00015686774941995363, "loss": 0.8665, "step": 1860 }, { "epoch": 0.87, "grad_norm": 2.8367908000946045, "learning_rate": 0.00015663573085846868, "loss": 0.7187, "step": 1870 }, { "epoch": 0.87, "grad_norm": 3.431257724761963, "learning_rate": 0.00015640371229698377, "loss": 0.8616, "step": 1880 }, { "epoch": 0.88, "grad_norm": 3.8367366790771484, "learning_rate": 0.00015617169373549885, "loss": 0.7892, "step": 1890 }, { "epoch": 0.88, "grad_norm": 2.648777723312378, "learning_rate": 0.00015593967517401393, "loss": 0.9186, "step": 1900 }, { "epoch": 0.88, "eval_accuracy": 0.6675043980899723, "eval_loss": 0.8260459899902344, "eval_runtime": 131.0193, "eval_samples_per_second": 30.37, "eval_steps_per_second": 3.801, "step": 1900 }, { "epoch": 0.89, "grad_norm": 2.5578160285949707, "learning_rate": 0.00015570765661252902, "loss": 0.6849, "step": 1910 }, { "epoch": 0.89, "grad_norm": 2.5033838748931885, "learning_rate": 0.00015547563805104407, "loss": 0.6708, "step": 1920 }, { "epoch": 0.9, "grad_norm": 2.074505090713501, "learning_rate": 0.00015524361948955916, "loss": 0.717, "step": 1930 }, { "epoch": 0.9, "grad_norm": 2.335425853729248, "learning_rate": 0.00015501160092807424, "loss": 0.9028, "step": 1940 }, { "epoch": 0.9, "grad_norm": 3.3634660243988037, "learning_rate": 0.00015477958236658935, "loss": 0.6975, "step": 1950 }, { "epoch": 0.91, "grad_norm": 2.022599697113037, "learning_rate": 0.00015454756380510443, "loss": 0.6303, "step": 1960 }, { "epoch": 0.91, "grad_norm": 4.197246551513672, "learning_rate": 0.0001543155452436195, "loss": 0.7147, "step": 1970 }, { "epoch": 0.92, "grad_norm": 3.748758554458618, "learning_rate": 0.00015408352668213457, "loss": 0.7944, "step": 1980 }, { "epoch": 0.92, "grad_norm": 2.029123067855835, "learning_rate": 0.00015385150812064966, "loss": 0.7363, "step": 1990 }, { "epoch": 0.93, "grad_norm": 2.3515162467956543, "learning_rate": 0.00015361948955916474, "loss": 0.8218, "step": 2000 }, { "epoch": 0.93, "eval_accuracy": 0.654938426740387, "eval_loss": 0.8351722359657288, "eval_runtime": 129.1873, "eval_samples_per_second": 30.8, "eval_steps_per_second": 3.855, "step": 2000 }, { "epoch": 0.93, "grad_norm": 3.3627982139587402, "learning_rate": 0.00015338747099767982, "loss": 0.7137, "step": 2010 }, { "epoch": 0.94, "grad_norm": 3.0946731567382812, "learning_rate": 0.0001531554524361949, "loss": 0.8327, "step": 2020 }, { "epoch": 0.94, "grad_norm": 1.9171329736709595, "learning_rate": 0.00015292343387470996, "loss": 0.7805, "step": 2030 }, { "epoch": 0.95, "grad_norm": 3.749093532562256, "learning_rate": 0.00015269141531322507, "loss": 0.8531, "step": 2040 }, { "epoch": 0.95, "grad_norm": 2.960636615753174, "learning_rate": 0.00015245939675174016, "loss": 0.7838, "step": 2050 }, { "epoch": 0.96, "grad_norm": 2.5994982719421387, "learning_rate": 0.00015222737819025524, "loss": 0.6932, "step": 2060 }, { "epoch": 0.96, "grad_norm": 2.6657791137695312, "learning_rate": 0.00015199535962877032, "loss": 0.731, "step": 2070 }, { "epoch": 0.97, "grad_norm": 1.7149091958999634, "learning_rate": 0.00015176334106728538, "loss": 0.8234, "step": 2080 }, { "epoch": 0.97, "grad_norm": 1.5878645181655884, "learning_rate": 0.00015153132250580046, "loss": 0.7098, "step": 2090 }, { "epoch": 0.97, "grad_norm": 1.8897655010223389, "learning_rate": 0.00015129930394431555, "loss": 0.6245, "step": 2100 }, { "epoch": 0.97, "eval_accuracy": 0.6763005780346821, "eval_loss": 0.7993046045303345, "eval_runtime": 133.106, "eval_samples_per_second": 29.893, "eval_steps_per_second": 3.741, "step": 2100 }, { "epoch": 0.98, "grad_norm": 3.2624616622924805, "learning_rate": 0.00015106728538283063, "loss": 0.8209, "step": 2110 }, { "epoch": 0.98, "grad_norm": 2.469926595687866, "learning_rate": 0.00015083526682134571, "loss": 0.7127, "step": 2120 }, { "epoch": 0.99, "grad_norm": 3.8582072257995605, "learning_rate": 0.0001506032482598608, "loss": 0.655, "step": 2130 }, { "epoch": 0.99, "grad_norm": 3.1348557472229004, "learning_rate": 0.00015037122969837588, "loss": 0.8291, "step": 2140 }, { "epoch": 1.0, "grad_norm": 3.1626625061035156, "learning_rate": 0.00015013921113689096, "loss": 0.8373, "step": 2150 }, { "epoch": 1.0, "grad_norm": 1.9470633268356323, "learning_rate": 0.00014990719257540605, "loss": 0.6225, "step": 2160 }, { "epoch": 1.01, "grad_norm": 2.336871862411499, "learning_rate": 0.00014967517401392113, "loss": 0.5857, "step": 2170 }, { "epoch": 1.01, "grad_norm": 1.737004280090332, "learning_rate": 0.00014944315545243621, "loss": 0.5935, "step": 2180 }, { "epoch": 1.02, "grad_norm": 2.336336612701416, "learning_rate": 0.00014921113689095127, "loss": 0.5824, "step": 2190 }, { "epoch": 1.02, "grad_norm": 2.338193655014038, "learning_rate": 0.00014897911832946635, "loss": 0.4945, "step": 2200 }, { "epoch": 1.02, "eval_accuracy": 0.6589595375722543, "eval_loss": 0.8315911889076233, "eval_runtime": 132.4435, "eval_samples_per_second": 30.043, "eval_steps_per_second": 3.76, "step": 2200 }, { "epoch": 1.03, "grad_norm": 2.667480707168579, "learning_rate": 0.00014874709976798144, "loss": 0.6186, "step": 2210 }, { "epoch": 1.03, "grad_norm": 2.019312858581543, "learning_rate": 0.00014851508120649652, "loss": 0.5037, "step": 2220 }, { "epoch": 1.03, "grad_norm": 2.4240450859069824, "learning_rate": 0.0001482830626450116, "loss": 0.5781, "step": 2230 }, { "epoch": 1.04, "grad_norm": 2.2333035469055176, "learning_rate": 0.0001480510440835267, "loss": 0.4923, "step": 2240 }, { "epoch": 1.04, "grad_norm": 2.023408889770508, "learning_rate": 0.00014781902552204177, "loss": 0.5855, "step": 2250 }, { "epoch": 1.05, "grad_norm": 2.4406158924102783, "learning_rate": 0.00014758700696055685, "loss": 0.5579, "step": 2260 }, { "epoch": 1.05, "grad_norm": 3.5192463397979736, "learning_rate": 0.00014735498839907194, "loss": 0.6066, "step": 2270 }, { "epoch": 1.06, "grad_norm": 4.174234390258789, "learning_rate": 0.00014712296983758702, "loss": 0.6238, "step": 2280 }, { "epoch": 1.06, "grad_norm": 2.916022539138794, "learning_rate": 0.00014689095127610208, "loss": 0.5475, "step": 2290 }, { "epoch": 1.07, "grad_norm": 3.1607933044433594, "learning_rate": 0.00014665893271461716, "loss": 0.6064, "step": 2300 }, { "epoch": 1.07, "eval_accuracy": 0.6680070369439558, "eval_loss": 0.8378371596336365, "eval_runtime": 132.7481, "eval_samples_per_second": 29.974, "eval_steps_per_second": 3.751, "step": 2300 }, { "epoch": 1.07, "grad_norm": 1.8557933568954468, "learning_rate": 0.00014642691415313224, "loss": 0.6037, "step": 2310 }, { "epoch": 1.08, "grad_norm": 4.142065048217773, "learning_rate": 0.00014619489559164733, "loss": 0.6552, "step": 2320 }, { "epoch": 1.08, "grad_norm": 3.622699499130249, "learning_rate": 0.00014596287703016244, "loss": 0.4755, "step": 2330 }, { "epoch": 1.09, "grad_norm": 3.9584805965423584, "learning_rate": 0.00014573085846867752, "loss": 0.677, "step": 2340 }, { "epoch": 1.09, "grad_norm": 2.498189926147461, "learning_rate": 0.00014549883990719258, "loss": 0.5519, "step": 2350 }, { "epoch": 1.1, "grad_norm": 5.097834587097168, "learning_rate": 0.00014526682134570766, "loss": 0.6068, "step": 2360 }, { "epoch": 1.1, "grad_norm": 3.7660293579101562, "learning_rate": 0.00014503480278422275, "loss": 0.5356, "step": 2370 }, { "epoch": 1.1, "grad_norm": 3.423243999481201, "learning_rate": 0.00014480278422273783, "loss": 0.6954, "step": 2380 }, { "epoch": 1.11, "grad_norm": 3.099900484085083, "learning_rate": 0.0001445707656612529, "loss": 0.4953, "step": 2390 }, { "epoch": 1.11, "grad_norm": 4.438780784606934, "learning_rate": 0.00014433874709976797, "loss": 0.638, "step": 2400 }, { "epoch": 1.11, "eval_accuracy": 0.6828348831364665, "eval_loss": 0.8223534822463989, "eval_runtime": 130.5252, "eval_samples_per_second": 30.485, "eval_steps_per_second": 3.815, "step": 2400 }, { "epoch": 1.12, "grad_norm": 2.960069417953491, "learning_rate": 0.00014410672853828305, "loss": 0.5635, "step": 2410 }, { "epoch": 1.12, "grad_norm": 1.4343048334121704, "learning_rate": 0.00014387470997679816, "loss": 0.5937, "step": 2420 }, { "epoch": 1.13, "grad_norm": 1.8916206359863281, "learning_rate": 0.00014364269141531325, "loss": 0.5543, "step": 2430 }, { "epoch": 1.13, "grad_norm": 2.315703868865967, "learning_rate": 0.00014341067285382833, "loss": 0.4934, "step": 2440 }, { "epoch": 1.14, "grad_norm": 2.3009326457977295, "learning_rate": 0.00014317865429234339, "loss": 0.5487, "step": 2450 }, { "epoch": 1.14, "grad_norm": 2.4745514392852783, "learning_rate": 0.00014294663573085847, "loss": 0.6988, "step": 2460 }, { "epoch": 1.15, "grad_norm": 2.8443727493286133, "learning_rate": 0.00014271461716937355, "loss": 0.5079, "step": 2470 }, { "epoch": 1.15, "grad_norm": 3.124251127243042, "learning_rate": 0.00014248259860788864, "loss": 0.5499, "step": 2480 }, { "epoch": 1.16, "grad_norm": 3.0896270275115967, "learning_rate": 0.00014225058004640372, "loss": 0.6026, "step": 2490 }, { "epoch": 1.16, "grad_norm": 2.8856897354125977, "learning_rate": 0.0001420185614849188, "loss": 0.6253, "step": 2500 }, { "epoch": 1.16, "eval_accuracy": 0.6617240512691631, "eval_loss": 0.8880072236061096, "eval_runtime": 129.7205, "eval_samples_per_second": 30.674, "eval_steps_per_second": 3.839, "step": 2500 }, { "epoch": 1.16, "step": 2500, "total_flos": 3.099325741767844e+18, "train_loss": 0.8266718128204346, "train_runtime": 5452.5845, "train_samples_per_second": 25.29, "train_steps_per_second": 1.581 } ], "logging_steps": 10, "max_steps": 8620, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 3.099325741767844e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }