{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.762511493497964, "eval_steps": 500, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.052541704978326546, "grad_norm": 0.744549572467804, "learning_rate": 1.6666666666666667e-06, "loss": 1.663, "step": 100 }, { "epoch": 0.10508340995665309, "grad_norm": 0.6462538838386536, "learning_rate": 3.3333333333333333e-06, "loss": 1.616, "step": 200 }, { "epoch": 0.15762511493497963, "grad_norm": 0.5421179533004761, "learning_rate": 5e-06, "loss": 1.4549, "step": 300 }, { "epoch": 0.21016681991330619, "grad_norm": 0.8848033547401428, "learning_rate": 6.666666666666667e-06, "loss": 1.3941, "step": 400 }, { "epoch": 0.2627085248916327, "grad_norm": 0.7000551819801331, "learning_rate": 8.333333333333334e-06, "loss": 1.2752, "step": 500 }, { "epoch": 0.2627085248916327, "eval_loss": 1.0633598566055298, "eval_runtime": 1.3002, "eval_samples_per_second": 7.691, "eval_steps_per_second": 3.845, "step": 500 }, { "epoch": 0.31525022986995926, "grad_norm": 0.8002354502677917, "learning_rate": 1e-05, "loss": 1.2205, "step": 600 }, { "epoch": 0.3677919348482858, "grad_norm": 0.7082624435424805, "learning_rate": 1.1666666666666668e-05, "loss": 1.188, "step": 700 }, { "epoch": 0.42033363982661237, "grad_norm": 0.626582682132721, "learning_rate": 1.3333333333333333e-05, "loss": 1.1647, "step": 800 }, { "epoch": 0.4728753448049389, "grad_norm": 1.0509939193725586, "learning_rate": 1.5e-05, "loss": 1.1427, "step": 900 }, { "epoch": 0.5254170497832654, "grad_norm": 1.1202670335769653, "learning_rate": 1.6666666666666667e-05, "loss": 1.144, "step": 1000 }, { "epoch": 0.5254170497832654, "eval_loss": 0.9546993374824524, "eval_runtime": 1.2391, "eval_samples_per_second": 8.07, "eval_steps_per_second": 4.035, "step": 1000 }, { "epoch": 0.577958754761592, "grad_norm": 1.1231364011764526, "learning_rate": 1.8333333333333333e-05, "loss": 1.0935, "step": 1100 }, { "epoch": 0.6305004597399185, "grad_norm": 1.0733870267868042, "learning_rate": 2e-05, "loss": 1.1038, "step": 1200 }, { "epoch": 0.6830421647182451, "grad_norm": 1.0306121110916138, "learning_rate": 2.1666666666666667e-05, "loss": 1.1055, "step": 1300 }, { "epoch": 0.7355838696965716, "grad_norm": 1.2776762247085571, "learning_rate": 2.3333333333333336e-05, "loss": 1.0724, "step": 1400 }, { "epoch": 0.7881255746748982, "grad_norm": 1.0678656101226807, "learning_rate": 2.5e-05, "loss": 1.0564, "step": 1500 }, { "epoch": 0.7881255746748982, "eval_loss": 0.912558913230896, "eval_runtime": 1.2576, "eval_samples_per_second": 7.952, "eval_steps_per_second": 3.976, "step": 1500 }, { "epoch": 0.8406672796532247, "grad_norm": 1.6063746213912964, "learning_rate": 2.6666666666666667e-05, "loss": 1.0391, "step": 1600 }, { "epoch": 0.8932089846315513, "grad_norm": 1.367933750152588, "learning_rate": 2.8333333333333335e-05, "loss": 1.0435, "step": 1700 }, { "epoch": 0.9457506896098778, "grad_norm": 1.0610206127166748, "learning_rate": 3e-05, "loss": 1.0436, "step": 1800 }, { "epoch": 0.9982923945882044, "grad_norm": 0.8625739216804504, "learning_rate": 3.1666666666666666e-05, "loss": 1.0334, "step": 1900 }, { "epoch": 1.0508340995665308, "grad_norm": 1.4904906749725342, "learning_rate": 3.3333333333333335e-05, "loss": 1.0228, "step": 2000 }, { "epoch": 1.0508340995665308, "eval_loss": 0.8912132978439331, "eval_runtime": 1.2665, "eval_samples_per_second": 7.896, "eval_steps_per_second": 3.948, "step": 2000 }, { "epoch": 1.1033758045448574, "grad_norm": 1.4299525022506714, "learning_rate": 3.5e-05, "loss": 1.0009, "step": 2100 }, { "epoch": 1.155917509523184, "grad_norm": 1.2776519060134888, "learning_rate": 3.6666666666666666e-05, "loss": 0.9988, "step": 2200 }, { "epoch": 1.2084592145015105, "grad_norm": 1.1559321880340576, "learning_rate": 3.8333333333333334e-05, "loss": 1.0003, "step": 2300 }, { "epoch": 1.261000919479837, "grad_norm": 0.9451723098754883, "learning_rate": 4e-05, "loss": 0.9961, "step": 2400 }, { "epoch": 1.3135426244581636, "grad_norm": 1.4040967226028442, "learning_rate": 4.166666666666667e-05, "loss": 0.9981, "step": 2500 }, { "epoch": 1.3135426244581636, "eval_loss": 0.8769587278366089, "eval_runtime": 1.2527, "eval_samples_per_second": 7.983, "eval_steps_per_second": 3.991, "step": 2500 }, { "epoch": 1.3660843294364902, "grad_norm": 1.469843864440918, "learning_rate": 4.3333333333333334e-05, "loss": 0.9868, "step": 2600 }, { "epoch": 1.4186260344148167, "grad_norm": 1.1479499340057373, "learning_rate": 4.5e-05, "loss": 0.9822, "step": 2700 }, { "epoch": 1.4711677393931433, "grad_norm": 2.1333274841308594, "learning_rate": 4.666666666666667e-05, "loss": 0.9921, "step": 2800 }, { "epoch": 1.5237094443714698, "grad_norm": 1.162014365196228, "learning_rate": 4.8333333333333334e-05, "loss": 0.9813, "step": 2900 }, { "epoch": 1.5762511493497964, "grad_norm": 0.9883254170417786, "learning_rate": 5e-05, "loss": 0.9796, "step": 3000 }, { "epoch": 1.5762511493497964, "eval_loss": 0.8728264570236206, "eval_runtime": 1.262, "eval_samples_per_second": 7.924, "eval_steps_per_second": 3.962, "step": 3000 }, { "epoch": 1.628792854328123, "grad_norm": 1.1366589069366455, "learning_rate": 4.981481481481482e-05, "loss": 0.985, "step": 3100 }, { "epoch": 1.6813345593064495, "grad_norm": 1.0342519283294678, "learning_rate": 4.962962962962963e-05, "loss": 0.9872, "step": 3200 }, { "epoch": 1.733876264284776, "grad_norm": 0.9889224767684937, "learning_rate": 4.9444444444444446e-05, "loss": 0.9892, "step": 3300 }, { "epoch": 1.7864179692631026, "grad_norm": 0.9875237345695496, "learning_rate": 4.925925925925926e-05, "loss": 0.9606, "step": 3400 }, { "epoch": 1.8389596742414291, "grad_norm": 0.9079448580741882, "learning_rate": 4.9074074074074075e-05, "loss": 0.9846, "step": 3500 }, { "epoch": 1.8389596742414291, "eval_loss": 0.869242787361145, "eval_runtime": 1.2422, "eval_samples_per_second": 8.05, "eval_steps_per_second": 4.025, "step": 3500 }, { "epoch": 1.8915013792197557, "grad_norm": 1.0478876829147339, "learning_rate": 4.888888888888889e-05, "loss": 0.9616, "step": 3600 }, { "epoch": 1.9440430841980822, "grad_norm": 1.0536926984786987, "learning_rate": 4.8703703703703704e-05, "loss": 0.9674, "step": 3700 }, { "epoch": 1.9965847891764088, "grad_norm": 1.1787534952163696, "learning_rate": 4.851851851851852e-05, "loss": 0.9567, "step": 3800 }, { "epoch": 2.049126494154735, "grad_norm": 1.0072757005691528, "learning_rate": 4.8333333333333334e-05, "loss": 0.9441, "step": 3900 }, { "epoch": 2.1016681991330617, "grad_norm": 1.2437305450439453, "learning_rate": 4.814814814814815e-05, "loss": 0.9316, "step": 4000 }, { "epoch": 2.1016681991330617, "eval_loss": 0.8525739908218384, "eval_runtime": 1.2391, "eval_samples_per_second": 8.07, "eval_steps_per_second": 4.035, "step": 4000 }, { "epoch": 2.1542099041113882, "grad_norm": 1.722901701927185, "learning_rate": 4.796296296296296e-05, "loss": 0.934, "step": 4100 }, { "epoch": 2.206751609089715, "grad_norm": 1.032166600227356, "learning_rate": 4.7777777777777784e-05, "loss": 0.9324, "step": 4200 }, { "epoch": 2.2592933140680413, "grad_norm": 1.0329689979553223, "learning_rate": 4.759259259259259e-05, "loss": 0.9327, "step": 4300 }, { "epoch": 2.311835019046368, "grad_norm": 0.8714644908905029, "learning_rate": 4.740740740740741e-05, "loss": 0.9228, "step": 4400 }, { "epoch": 2.3643767240246945, "grad_norm": 1.048893928527832, "learning_rate": 4.722222222222222e-05, "loss": 0.9356, "step": 4500 }, { "epoch": 2.3643767240246945, "eval_loss": 0.8477146029472351, "eval_runtime": 1.2917, "eval_samples_per_second": 7.742, "eval_steps_per_second": 3.871, "step": 4500 }, { "epoch": 2.416918429003021, "grad_norm": 1.1038933992385864, "learning_rate": 4.703703703703704e-05, "loss": 0.9317, "step": 4600 }, { "epoch": 2.4694601339813476, "grad_norm": 0.9085045456886292, "learning_rate": 4.685185185185185e-05, "loss": 0.9411, "step": 4700 }, { "epoch": 2.522001838959674, "grad_norm": 1.1415351629257202, "learning_rate": 4.666666666666667e-05, "loss": 0.9227, "step": 4800 }, { "epoch": 2.5745435439380007, "grad_norm": 0.9230244755744934, "learning_rate": 4.648148148148148e-05, "loss": 0.9362, "step": 4900 }, { "epoch": 2.627085248916327, "grad_norm": 0.9721058011054993, "learning_rate": 4.62962962962963e-05, "loss": 0.9297, "step": 5000 }, { "epoch": 2.627085248916327, "eval_loss": 0.8504465818405151, "eval_runtime": 1.2834, "eval_samples_per_second": 7.792, "eval_steps_per_second": 3.896, "step": 5000 }, { "epoch": 2.6796269538946538, "grad_norm": 0.933588981628418, "learning_rate": 4.6111111111111115e-05, "loss": 0.9339, "step": 5100 }, { "epoch": 2.7321686588729803, "grad_norm": 0.915013313293457, "learning_rate": 4.592592592592593e-05, "loss": 0.9322, "step": 5200 }, { "epoch": 2.784710363851307, "grad_norm": 1.2846601009368896, "learning_rate": 4.5740740740740745e-05, "loss": 0.913, "step": 5300 }, { "epoch": 2.8372520688296334, "grad_norm": 1.1330525875091553, "learning_rate": 4.555555555555556e-05, "loss": 0.9241, "step": 5400 }, { "epoch": 2.88979377380796, "grad_norm": 1.6967169046401978, "learning_rate": 4.5370370370370374e-05, "loss": 0.9127, "step": 5500 }, { "epoch": 2.88979377380796, "eval_loss": 0.8335689306259155, "eval_runtime": 1.2838, "eval_samples_per_second": 7.789, "eval_steps_per_second": 3.895, "step": 5500 }, { "epoch": 2.9423354787862865, "grad_norm": 0.9576010704040527, "learning_rate": 4.518518518518519e-05, "loss": 0.9097, "step": 5600 }, { "epoch": 2.994877183764613, "grad_norm": 1.229671835899353, "learning_rate": 4.5e-05, "loss": 0.9165, "step": 5700 }, { "epoch": 3.0474188887429396, "grad_norm": 0.9229328632354736, "learning_rate": 4.481481481481482e-05, "loss": 0.9066, "step": 5800 }, { "epoch": 3.099960593721266, "grad_norm": 1.1152088642120361, "learning_rate": 4.462962962962963e-05, "loss": 0.8888, "step": 5900 }, { "epoch": 3.1525022986995928, "grad_norm": 1.0948377847671509, "learning_rate": 4.4444444444444447e-05, "loss": 0.9233, "step": 6000 }, { "epoch": 3.1525022986995928, "eval_loss": 0.8405929803848267, "eval_runtime": 1.2669, "eval_samples_per_second": 7.893, "eval_steps_per_second": 3.947, "step": 6000 }, { "epoch": 3.2050440036779193, "grad_norm": 1.320657730102539, "learning_rate": 4.425925925925926e-05, "loss": 0.9024, "step": 6100 }, { "epoch": 3.257585708656246, "grad_norm": 1.3515260219573975, "learning_rate": 4.4074074074074076e-05, "loss": 0.8992, "step": 6200 }, { "epoch": 3.3101274136345724, "grad_norm": 0.9061586260795593, "learning_rate": 4.388888888888889e-05, "loss": 0.8966, "step": 6300 }, { "epoch": 3.362669118612899, "grad_norm": 0.7893356084823608, "learning_rate": 4.3703703703703705e-05, "loss": 0.8772, "step": 6400 }, { "epoch": 3.4152108235912255, "grad_norm": 1.1485052108764648, "learning_rate": 4.351851851851852e-05, "loss": 0.8935, "step": 6500 }, { "epoch": 3.4152108235912255, "eval_loss": 0.8316594362258911, "eval_runtime": 1.2559, "eval_samples_per_second": 7.962, "eval_steps_per_second": 3.981, "step": 6500 }, { "epoch": 3.467752528569552, "grad_norm": 1.2252397537231445, "learning_rate": 4.3333333333333334e-05, "loss": 0.8877, "step": 6600 }, { "epoch": 3.5202942335478786, "grad_norm": 0.937276303768158, "learning_rate": 4.314814814814815e-05, "loss": 0.899, "step": 6700 }, { "epoch": 3.572835938526205, "grad_norm": 1.1383343935012817, "learning_rate": 4.296296296296296e-05, "loss": 0.884, "step": 6800 }, { "epoch": 3.6253776435045317, "grad_norm": 0.9624786376953125, "learning_rate": 4.277777777777778e-05, "loss": 0.8933, "step": 6900 }, { "epoch": 3.6779193484828583, "grad_norm": 1.2297359704971313, "learning_rate": 4.259259259259259e-05, "loss": 0.8902, "step": 7000 }, { "epoch": 3.6779193484828583, "eval_loss": 0.8357389569282532, "eval_runtime": 1.2564, "eval_samples_per_second": 7.959, "eval_steps_per_second": 3.979, "step": 7000 }, { "epoch": 3.730461053461185, "grad_norm": 1.3117122650146484, "learning_rate": 4.240740740740741e-05, "loss": 0.8833, "step": 7100 }, { "epoch": 3.7830027584395114, "grad_norm": 1.0405583381652832, "learning_rate": 4.222222222222222e-05, "loss": 0.8974, "step": 7200 }, { "epoch": 3.835544463417838, "grad_norm": 1.1837128400802612, "learning_rate": 4.203703703703704e-05, "loss": 0.9108, "step": 7300 }, { "epoch": 3.8880861683961645, "grad_norm": 1.070272445678711, "learning_rate": 4.185185185185185e-05, "loss": 0.888, "step": 7400 }, { "epoch": 3.940627873374491, "grad_norm": 1.2699558734893799, "learning_rate": 4.166666666666667e-05, "loss": 0.8996, "step": 7500 }, { "epoch": 3.940627873374491, "eval_loss": 0.8259842991828918, "eval_runtime": 1.2588, "eval_samples_per_second": 7.944, "eval_steps_per_second": 3.972, "step": 7500 }, { "epoch": 3.9931695783528176, "grad_norm": 1.1410374641418457, "learning_rate": 4.148148148148148e-05, "loss": 0.8931, "step": 7600 }, { "epoch": 4.045711283331144, "grad_norm": 0.8494368195533752, "learning_rate": 4.12962962962963e-05, "loss": 0.8735, "step": 7700 }, { "epoch": 4.09825298830947, "grad_norm": 1.6662884950637817, "learning_rate": 4.111111111111111e-05, "loss": 0.8706, "step": 7800 }, { "epoch": 4.150794693287797, "grad_norm": 1.103721022605896, "learning_rate": 4.092592592592593e-05, "loss": 0.8868, "step": 7900 }, { "epoch": 4.203336398266123, "grad_norm": 1.0457319021224976, "learning_rate": 4.074074074074074e-05, "loss": 0.8616, "step": 8000 }, { "epoch": 4.203336398266123, "eval_loss": 0.8296699523925781, "eval_runtime": 1.2594, "eval_samples_per_second": 7.941, "eval_steps_per_second": 3.97, "step": 8000 }, { "epoch": 4.25587810324445, "grad_norm": 1.038172721862793, "learning_rate": 4.055555555555556e-05, "loss": 0.8814, "step": 8100 }, { "epoch": 4.3084198082227765, "grad_norm": 1.0770879983901978, "learning_rate": 4.0370370370370374e-05, "loss": 0.8781, "step": 8200 }, { "epoch": 4.3609615132011035, "grad_norm": 0.9559742212295532, "learning_rate": 4.018518518518519e-05, "loss": 0.8631, "step": 8300 }, { "epoch": 4.41350321817943, "grad_norm": 1.2352747917175293, "learning_rate": 4e-05, "loss": 0.8619, "step": 8400 }, { "epoch": 4.466044923157757, "grad_norm": 1.0156300067901611, "learning_rate": 3.981481481481482e-05, "loss": 0.8589, "step": 8500 }, { "epoch": 4.466044923157757, "eval_loss": 0.8272676467895508, "eval_runtime": 1.239, "eval_samples_per_second": 8.071, "eval_steps_per_second": 4.035, "step": 8500 }, { "epoch": 4.518586628136083, "grad_norm": 1.3172954320907593, "learning_rate": 3.962962962962963e-05, "loss": 0.8782, "step": 8600 }, { "epoch": 4.57112833311441, "grad_norm": 1.0361146926879883, "learning_rate": 3.944444444444445e-05, "loss": 0.8929, "step": 8700 }, { "epoch": 4.623670038092736, "grad_norm": 1.0141929388046265, "learning_rate": 3.925925925925926e-05, "loss": 0.8701, "step": 8800 }, { "epoch": 4.676211743071063, "grad_norm": 1.4929876327514648, "learning_rate": 3.9074074074074076e-05, "loss": 0.8877, "step": 8900 }, { "epoch": 4.728753448049389, "grad_norm": 1.6893892288208008, "learning_rate": 3.888888888888889e-05, "loss": 0.8751, "step": 9000 }, { "epoch": 4.728753448049389, "eval_loss": 0.822452187538147, "eval_runtime": 1.246, "eval_samples_per_second": 8.026, "eval_steps_per_second": 4.013, "step": 9000 }, { "epoch": 4.781295153027716, "grad_norm": 0.993008017539978, "learning_rate": 3.8703703703703705e-05, "loss": 0.8689, "step": 9100 }, { "epoch": 4.833836858006042, "grad_norm": 0.8024290800094604, "learning_rate": 3.851851851851852e-05, "loss": 0.8945, "step": 9200 }, { "epoch": 4.886378562984369, "grad_norm": 0.8442456722259521, "learning_rate": 3.8333333333333334e-05, "loss": 0.8897, "step": 9300 }, { "epoch": 4.938920267962695, "grad_norm": 1.6505470275878906, "learning_rate": 3.814814814814815e-05, "loss": 0.8481, "step": 9400 }, { "epoch": 4.991461972941022, "grad_norm": 0.9656145572662354, "learning_rate": 3.7962962962962964e-05, "loss": 0.8699, "step": 9500 }, { "epoch": 4.991461972941022, "eval_loss": 0.8228476643562317, "eval_runtime": 1.2383, "eval_samples_per_second": 8.075, "eval_steps_per_second": 4.038, "step": 9500 }, { "epoch": 5.044003677919348, "grad_norm": 1.1603432893753052, "learning_rate": 3.777777777777778e-05, "loss": 0.8731, "step": 9600 }, { "epoch": 5.096545382897675, "grad_norm": 0.8760092854499817, "learning_rate": 3.759259259259259e-05, "loss": 0.8268, "step": 9700 }, { "epoch": 5.149087087876001, "grad_norm": 1.50885009765625, "learning_rate": 3.740740740740741e-05, "loss": 0.8341, "step": 9800 }, { "epoch": 5.201628792854328, "grad_norm": 1.164678692817688, "learning_rate": 3.722222222222222e-05, "loss": 0.8511, "step": 9900 }, { "epoch": 5.254170497832654, "grad_norm": 1.2179008722305298, "learning_rate": 3.7037037037037037e-05, "loss": 0.863, "step": 10000 }, { "epoch": 5.254170497832654, "eval_loss": 0.8227807283401489, "eval_runtime": 1.2397, "eval_samples_per_second": 8.067, "eval_steps_per_second": 4.033, "step": 10000 }, { "epoch": 5.306712202810981, "grad_norm": 1.1593024730682373, "learning_rate": 3.685185185185185e-05, "loss": 0.8481, "step": 10100 }, { "epoch": 5.3592539077893075, "grad_norm": 1.1822712421417236, "learning_rate": 3.6666666666666666e-05, "loss": 0.8435, "step": 10200 }, { "epoch": 5.4117956127676345, "grad_norm": 1.1157796382904053, "learning_rate": 3.648148148148148e-05, "loss": 0.8265, "step": 10300 }, { "epoch": 5.464337317745961, "grad_norm": 1.2513401508331299, "learning_rate": 3.62962962962963e-05, "loss": 0.8916, "step": 10400 }, { "epoch": 5.516879022724288, "grad_norm": 1.07020902633667, "learning_rate": 3.611111111111111e-05, "loss": 0.855, "step": 10500 }, { "epoch": 5.516879022724288, "eval_loss": 0.8260787725448608, "eval_runtime": 1.2426, "eval_samples_per_second": 8.048, "eval_steps_per_second": 4.024, "step": 10500 }, { "epoch": 5.569420727702614, "grad_norm": 1.281821370124817, "learning_rate": 3.592592592592593e-05, "loss": 0.8375, "step": 10600 }, { "epoch": 5.621962432680941, "grad_norm": 1.279232144355774, "learning_rate": 3.574074074074074e-05, "loss": 0.8374, "step": 10700 }, { "epoch": 5.674504137659267, "grad_norm": 1.6183032989501953, "learning_rate": 3.555555555555556e-05, "loss": 0.8396, "step": 10800 }, { "epoch": 5.727045842637594, "grad_norm": 1.5569584369659424, "learning_rate": 3.537037037037037e-05, "loss": 0.8474, "step": 10900 }, { "epoch": 5.77958754761592, "grad_norm": 0.9284697771072388, "learning_rate": 3.518518518518519e-05, "loss": 0.8447, "step": 11000 }, { "epoch": 5.77958754761592, "eval_loss": 0.82489413022995, "eval_runtime": 1.2928, "eval_samples_per_second": 7.735, "eval_steps_per_second": 3.868, "step": 11000 }, { "epoch": 5.832129252594247, "grad_norm": 1.129302740097046, "learning_rate": 3.5e-05, "loss": 0.8425, "step": 11100 }, { "epoch": 5.884670957572573, "grad_norm": 1.1644898653030396, "learning_rate": 3.481481481481482e-05, "loss": 0.8545, "step": 11200 }, { "epoch": 5.9372126625509, "grad_norm": 1.5778745412826538, "learning_rate": 3.4629629629629626e-05, "loss": 0.8488, "step": 11300 }, { "epoch": 5.989754367529226, "grad_norm": 1.0982458591461182, "learning_rate": 3.444444444444445e-05, "loss": 0.8368, "step": 11400 }, { "epoch": 6.042296072507553, "grad_norm": 1.3936185836791992, "learning_rate": 3.425925925925926e-05, "loss": 0.8165, "step": 11500 }, { "epoch": 6.042296072507553, "eval_loss": 0.8229165077209473, "eval_runtime": 1.2812, "eval_samples_per_second": 7.805, "eval_steps_per_second": 3.903, "step": 11500 }, { "epoch": 6.094837777485879, "grad_norm": 0.8609622716903687, "learning_rate": 3.4074074074074077e-05, "loss": 0.813, "step": 11600 }, { "epoch": 6.147379482464206, "grad_norm": 1.75618314743042, "learning_rate": 3.388888888888889e-05, "loss": 0.8285, "step": 11700 }, { "epoch": 6.199921187442532, "grad_norm": 1.5755348205566406, "learning_rate": 3.3703703703703706e-05, "loss": 0.8061, "step": 11800 }, { "epoch": 6.252462892420859, "grad_norm": 1.1621313095092773, "learning_rate": 3.351851851851852e-05, "loss": 0.8349, "step": 11900 }, { "epoch": 6.3050045973991855, "grad_norm": 1.3047136068344116, "learning_rate": 3.3333333333333335e-05, "loss": 0.8247, "step": 12000 }, { "epoch": 6.3050045973991855, "eval_loss": 0.8217476010322571, "eval_runtime": 1.249, "eval_samples_per_second": 8.007, "eval_steps_per_second": 4.003, "step": 12000 }, { "epoch": 6.3575463023775125, "grad_norm": 1.161156415939331, "learning_rate": 3.314814814814815e-05, "loss": 0.8248, "step": 12100 }, { "epoch": 6.410088007355839, "grad_norm": 1.095155954360962, "learning_rate": 3.2962962962962964e-05, "loss": 0.8275, "step": 12200 }, { "epoch": 6.462629712334166, "grad_norm": 1.3355261087417603, "learning_rate": 3.277777777777778e-05, "loss": 0.833, "step": 12300 }, { "epoch": 6.515171417312492, "grad_norm": 1.5305536985397339, "learning_rate": 3.25925925925926e-05, "loss": 0.8259, "step": 12400 }, { "epoch": 6.567713122290819, "grad_norm": 2.5994789600372314, "learning_rate": 3.240740740740741e-05, "loss": 0.8459, "step": 12500 }, { "epoch": 6.567713122290819, "eval_loss": 0.8286579251289368, "eval_runtime": 1.2728, "eval_samples_per_second": 7.856, "eval_steps_per_second": 3.928, "step": 12500 }, { "epoch": 6.620254827269145, "grad_norm": 1.442550539970398, "learning_rate": 3.222222222222223e-05, "loss": 0.8294, "step": 12600 }, { "epoch": 6.672796532247471, "grad_norm": 1.1451183557510376, "learning_rate": 3.203703703703704e-05, "loss": 0.8284, "step": 12700 }, { "epoch": 6.725338237225798, "grad_norm": 1.0634784698486328, "learning_rate": 3.185185185185185e-05, "loss": 0.823, "step": 12800 }, { "epoch": 6.777879942204125, "grad_norm": 1.2910544872283936, "learning_rate": 3.1666666666666666e-05, "loss": 0.8248, "step": 12900 }, { "epoch": 6.830421647182451, "grad_norm": 1.1855148077011108, "learning_rate": 3.148148148148148e-05, "loss": 0.854, "step": 13000 }, { "epoch": 6.830421647182451, "eval_loss": 0.8177132606506348, "eval_runtime": 1.2889, "eval_samples_per_second": 7.759, "eval_steps_per_second": 3.879, "step": 13000 }, { "epoch": 6.882963352160777, "grad_norm": 1.2636213302612305, "learning_rate": 3.1296296296296295e-05, "loss": 0.8429, "step": 13100 }, { "epoch": 6.935505057139104, "grad_norm": 1.2894644737243652, "learning_rate": 3.111111111111111e-05, "loss": 0.8279, "step": 13200 }, { "epoch": 6.988046762117431, "grad_norm": 1.64339280128479, "learning_rate": 3.0925925925925924e-05, "loss": 0.8217, "step": 13300 }, { "epoch": 7.040588467095757, "grad_norm": 0.9605098962783813, "learning_rate": 3.074074074074074e-05, "loss": 0.8375, "step": 13400 }, { "epoch": 7.093130172074083, "grad_norm": 1.4828298091888428, "learning_rate": 3.055555555555556e-05, "loss": 0.8079, "step": 13500 }, { "epoch": 7.093130172074083, "eval_loss": 0.8257712125778198, "eval_runtime": 1.2505, "eval_samples_per_second": 7.997, "eval_steps_per_second": 3.998, "step": 13500 }, { "epoch": 7.14567187705241, "grad_norm": 1.3807919025421143, "learning_rate": 3.037037037037037e-05, "loss": 0.8096, "step": 13600 }, { "epoch": 7.1982135820307365, "grad_norm": 1.5678062438964844, "learning_rate": 3.018518518518519e-05, "loss": 0.7869, "step": 13700 }, { "epoch": 7.2507552870090635, "grad_norm": 1.0710649490356445, "learning_rate": 3e-05, "loss": 0.813, "step": 13800 }, { "epoch": 7.30329699198739, "grad_norm": 1.150152564048767, "learning_rate": 2.981481481481482e-05, "loss": 0.7974, "step": 13900 }, { "epoch": 7.355838696965717, "grad_norm": 1.4336960315704346, "learning_rate": 2.962962962962963e-05, "loss": 0.8126, "step": 14000 }, { "epoch": 7.355838696965717, "eval_loss": 0.8250346183776855, "eval_runtime": 1.2966, "eval_samples_per_second": 7.712, "eval_steps_per_second": 3.856, "step": 14000 }, { "epoch": 7.408380401944043, "grad_norm": 1.454750895500183, "learning_rate": 2.9444444444444448e-05, "loss": 0.8534, "step": 14100 }, { "epoch": 7.46092210692237, "grad_norm": 1.3246815204620361, "learning_rate": 2.925925925925926e-05, "loss": 0.8313, "step": 14200 }, { "epoch": 7.513463811900696, "grad_norm": 1.116432547569275, "learning_rate": 2.9074074074074077e-05, "loss": 0.8085, "step": 14300 }, { "epoch": 7.566005516879023, "grad_norm": 1.0311377048492432, "learning_rate": 2.8888888888888888e-05, "loss": 0.7984, "step": 14400 }, { "epoch": 7.618547221857349, "grad_norm": 1.5071169137954712, "learning_rate": 2.8703703703703706e-05, "loss": 0.817, "step": 14500 }, { "epoch": 7.618547221857349, "eval_loss": 0.8244579434394836, "eval_runtime": 1.2449, "eval_samples_per_second": 8.032, "eval_steps_per_second": 4.016, "step": 14500 }, { "epoch": 7.671088926835676, "grad_norm": 1.2359856367111206, "learning_rate": 2.851851851851852e-05, "loss": 0.807, "step": 14600 }, { "epoch": 7.723630631814002, "grad_norm": 1.2495603561401367, "learning_rate": 2.8333333333333335e-05, "loss": 0.8035, "step": 14700 }, { "epoch": 7.776172336792329, "grad_norm": 1.0622730255126953, "learning_rate": 2.814814814814815e-05, "loss": 0.8199, "step": 14800 }, { "epoch": 7.828714041770655, "grad_norm": 1.097486138343811, "learning_rate": 2.7962962962962965e-05, "loss": 0.8107, "step": 14900 }, { "epoch": 7.881255746748982, "grad_norm": 1.1956055164337158, "learning_rate": 2.777777777777778e-05, "loss": 0.7962, "step": 15000 }, { "epoch": 7.881255746748982, "eval_loss": 0.8224829435348511, "eval_runtime": 1.2586, "eval_samples_per_second": 7.946, "eval_steps_per_second": 3.973, "step": 15000 }, { "epoch": 7.933797451727308, "grad_norm": 1.154567837715149, "learning_rate": 2.7592592592592594e-05, "loss": 0.8278, "step": 15100 }, { "epoch": 7.986339156705635, "grad_norm": 1.4666606187820435, "learning_rate": 2.7407407407407408e-05, "loss": 0.8157, "step": 15200 }, { "epoch": 8.038880861683962, "grad_norm": 1.115820050239563, "learning_rate": 2.7222222222222223e-05, "loss": 0.7899, "step": 15300 }, { "epoch": 8.091422566662288, "grad_norm": 1.4497051239013672, "learning_rate": 2.7037037037037037e-05, "loss": 0.7809, "step": 15400 }, { "epoch": 8.143964271640614, "grad_norm": 1.9520905017852783, "learning_rate": 2.6851851851851855e-05, "loss": 0.7972, "step": 15500 }, { "epoch": 8.143964271640614, "eval_loss": 0.8295345306396484, "eval_runtime": 1.2468, "eval_samples_per_second": 8.02, "eval_steps_per_second": 4.01, "step": 15500 }, { "epoch": 8.19650597661894, "grad_norm": 1.1650198698043823, "learning_rate": 2.6666666666666667e-05, "loss": 0.7869, "step": 15600 }, { "epoch": 8.249047681597268, "grad_norm": 0.9997352957725525, "learning_rate": 2.6481481481481485e-05, "loss": 0.7786, "step": 15700 }, { "epoch": 8.301589386575595, "grad_norm": 1.173376202583313, "learning_rate": 2.6296296296296296e-05, "loss": 0.7903, "step": 15800 }, { "epoch": 8.35413109155392, "grad_norm": 1.3240666389465332, "learning_rate": 2.6111111111111114e-05, "loss": 0.7874, "step": 15900 }, { "epoch": 8.406672796532247, "grad_norm": 1.077017903327942, "learning_rate": 2.5925925925925925e-05, "loss": 0.7835, "step": 16000 }, { "epoch": 8.406672796532247, "eval_loss": 0.8272402882575989, "eval_runtime": 1.2976, "eval_samples_per_second": 7.706, "eval_steps_per_second": 3.853, "step": 16000 }, { "epoch": 8.459214501510575, "grad_norm": 1.0706350803375244, "learning_rate": 2.5740740740740743e-05, "loss": 0.8083, "step": 16100 }, { "epoch": 8.5117562064889, "grad_norm": 1.0505809783935547, "learning_rate": 2.5555555555555554e-05, "loss": 0.8085, "step": 16200 }, { "epoch": 8.564297911467227, "grad_norm": 1.0813027620315552, "learning_rate": 2.5370370370370372e-05, "loss": 0.7943, "step": 16300 }, { "epoch": 8.616839616445553, "grad_norm": 1.4488904476165771, "learning_rate": 2.5185185185185183e-05, "loss": 0.7885, "step": 16400 }, { "epoch": 8.66938132142388, "grad_norm": 1.1785513162612915, "learning_rate": 2.5e-05, "loss": 0.7867, "step": 16500 }, { "epoch": 8.66938132142388, "eval_loss": 0.8290830850601196, "eval_runtime": 1.2989, "eval_samples_per_second": 7.699, "eval_steps_per_second": 3.849, "step": 16500 }, { "epoch": 8.721923026402207, "grad_norm": 1.331755518913269, "learning_rate": 2.4814814814814816e-05, "loss": 0.7887, "step": 16600 }, { "epoch": 8.774464731380533, "grad_norm": 1.0079772472381592, "learning_rate": 2.462962962962963e-05, "loss": 0.7831, "step": 16700 }, { "epoch": 8.82700643635886, "grad_norm": 1.6257929801940918, "learning_rate": 2.4444444444444445e-05, "loss": 0.788, "step": 16800 }, { "epoch": 8.879548141337187, "grad_norm": 1.5751090049743652, "learning_rate": 2.425925925925926e-05, "loss": 0.7864, "step": 16900 }, { "epoch": 8.932089846315513, "grad_norm": 1.0488544702529907, "learning_rate": 2.4074074074074074e-05, "loss": 0.8083, "step": 17000 }, { "epoch": 8.932089846315513, "eval_loss": 0.8188499212265015, "eval_runtime": 1.2935, "eval_samples_per_second": 7.731, "eval_steps_per_second": 3.865, "step": 17000 }, { "epoch": 8.98463155129384, "grad_norm": 1.8487260341644287, "learning_rate": 2.3888888888888892e-05, "loss": 0.7956, "step": 17100 }, { "epoch": 9.037173256272165, "grad_norm": 1.395914912223816, "learning_rate": 2.3703703703703707e-05, "loss": 0.7921, "step": 17200 }, { "epoch": 9.089714961250493, "grad_norm": 1.341793417930603, "learning_rate": 2.351851851851852e-05, "loss": 0.7975, "step": 17300 }, { "epoch": 9.14225666622882, "grad_norm": 1.0076344013214111, "learning_rate": 2.3333333333333336e-05, "loss": 0.773, "step": 17400 }, { "epoch": 9.194798371207145, "grad_norm": 1.081777811050415, "learning_rate": 2.314814814814815e-05, "loss": 0.7654, "step": 17500 }, { "epoch": 9.194798371207145, "eval_loss": 0.8274938464164734, "eval_runtime": 1.2836, "eval_samples_per_second": 7.79, "eval_steps_per_second": 3.895, "step": 17500 }, { "epoch": 9.247340076185472, "grad_norm": 1.1430171728134155, "learning_rate": 2.2962962962962965e-05, "loss": 0.7956, "step": 17600 }, { "epoch": 9.2998817811638, "grad_norm": 1.0450961589813232, "learning_rate": 2.277777777777778e-05, "loss": 0.7774, "step": 17700 }, { "epoch": 9.352423486142126, "grad_norm": 1.0086592435836792, "learning_rate": 2.2592592592592594e-05, "loss": 0.7739, "step": 17800 }, { "epoch": 9.404965191120452, "grad_norm": 1.7970887422561646, "learning_rate": 2.240740740740741e-05, "loss": 0.7793, "step": 17900 }, { "epoch": 9.457506896098778, "grad_norm": 1.5623388290405273, "learning_rate": 2.2222222222222223e-05, "loss": 0.7793, "step": 18000 }, { "epoch": 9.457506896098778, "eval_loss": 0.8288339376449585, "eval_runtime": 1.2843, "eval_samples_per_second": 7.786, "eval_steps_per_second": 3.893, "step": 18000 }, { "epoch": 9.510048601077106, "grad_norm": 1.1271198987960815, "learning_rate": 2.2037037037037038e-05, "loss": 0.7693, "step": 18100 }, { "epoch": 9.562590306055432, "grad_norm": 1.0703665018081665, "learning_rate": 2.1851851851851852e-05, "loss": 0.7734, "step": 18200 }, { "epoch": 9.615132011033758, "grad_norm": 1.5558685064315796, "learning_rate": 2.1666666666666667e-05, "loss": 0.7778, "step": 18300 }, { "epoch": 9.667673716012084, "grad_norm": 1.932632565498352, "learning_rate": 2.148148148148148e-05, "loss": 0.7842, "step": 18400 }, { "epoch": 9.720215420990412, "grad_norm": 2.2928659915924072, "learning_rate": 2.1296296296296296e-05, "loss": 0.7761, "step": 18500 }, { "epoch": 9.720215420990412, "eval_loss": 0.8304858207702637, "eval_runtime": 1.2431, "eval_samples_per_second": 8.045, "eval_steps_per_second": 4.022, "step": 18500 }, { "epoch": 9.772757125968738, "grad_norm": 1.588517189025879, "learning_rate": 2.111111111111111e-05, "loss": 0.7738, "step": 18600 }, { "epoch": 9.825298830947064, "grad_norm": 1.4118093252182007, "learning_rate": 2.0925925925925925e-05, "loss": 0.7852, "step": 18700 }, { "epoch": 9.87784053592539, "grad_norm": 1.7786134481430054, "learning_rate": 2.074074074074074e-05, "loss": 0.789, "step": 18800 }, { "epoch": 9.930382240903718, "grad_norm": 1.4427546262741089, "learning_rate": 2.0555555555555555e-05, "loss": 0.8028, "step": 18900 }, { "epoch": 9.982923945882044, "grad_norm": 1.2574381828308105, "learning_rate": 2.037037037037037e-05, "loss": 0.7694, "step": 19000 }, { "epoch": 9.982923945882044, "eval_loss": 0.8281936645507812, "eval_runtime": 1.2455, "eval_samples_per_second": 8.029, "eval_steps_per_second": 4.015, "step": 19000 }, { "epoch": 10.03546565086037, "grad_norm": 1.5743870735168457, "learning_rate": 2.0185185185185187e-05, "loss": 0.7619, "step": 19100 }, { "epoch": 10.088007355838696, "grad_norm": 1.350224494934082, "learning_rate": 2e-05, "loss": 0.7596, "step": 19200 }, { "epoch": 10.140549060817024, "grad_norm": 1.1520729064941406, "learning_rate": 1.9814814814814816e-05, "loss": 0.7561, "step": 19300 }, { "epoch": 10.19309076579535, "grad_norm": 1.2705488204956055, "learning_rate": 1.962962962962963e-05, "loss": 0.7659, "step": 19400 }, { "epoch": 10.245632470773677, "grad_norm": 1.1752071380615234, "learning_rate": 1.9444444444444445e-05, "loss": 0.7597, "step": 19500 }, { "epoch": 10.245632470773677, "eval_loss": 0.8358650207519531, "eval_runtime": 1.2389, "eval_samples_per_second": 8.072, "eval_steps_per_second": 4.036, "step": 19500 }, { "epoch": 10.298174175752003, "grad_norm": 1.3173915147781372, "learning_rate": 1.925925925925926e-05, "loss": 0.7787, "step": 19600 }, { "epoch": 10.35071588073033, "grad_norm": 1.3181281089782715, "learning_rate": 1.9074074074074075e-05, "loss": 0.7559, "step": 19700 }, { "epoch": 10.403257585708657, "grad_norm": 1.1058646440505981, "learning_rate": 1.888888888888889e-05, "loss": 0.7659, "step": 19800 }, { "epoch": 10.455799290686983, "grad_norm": 1.2898966073989868, "learning_rate": 1.8703703703703704e-05, "loss": 0.7642, "step": 19900 }, { "epoch": 10.508340995665309, "grad_norm": 1.7255585193634033, "learning_rate": 1.8518518518518518e-05, "loss": 0.7659, "step": 20000 }, { "epoch": 10.508340995665309, "eval_loss": 0.8337475061416626, "eval_runtime": 1.2724, "eval_samples_per_second": 7.859, "eval_steps_per_second": 3.93, "step": 20000 }, { "epoch": 10.560882700643635, "grad_norm": 1.101854681968689, "learning_rate": 1.8333333333333333e-05, "loss": 0.7754, "step": 20100 }, { "epoch": 10.613424405621963, "grad_norm": 1.6675972938537598, "learning_rate": 1.814814814814815e-05, "loss": 0.7597, "step": 20200 }, { "epoch": 10.665966110600289, "grad_norm": 1.428888201713562, "learning_rate": 1.7962962962962965e-05, "loss": 0.7714, "step": 20300 }, { "epoch": 10.718507815578615, "grad_norm": 1.2556451559066772, "learning_rate": 1.777777777777778e-05, "loss": 0.7628, "step": 20400 }, { "epoch": 10.771049520556943, "grad_norm": 1.817168951034546, "learning_rate": 1.7592592592592595e-05, "loss": 0.7652, "step": 20500 }, { "epoch": 10.771049520556943, "eval_loss": 0.8289246559143066, "eval_runtime": 1.2657, "eval_samples_per_second": 7.901, "eval_steps_per_second": 3.95, "step": 20500 }, { "epoch": 10.823591225535269, "grad_norm": 0.9452287554740906, "learning_rate": 1.740740740740741e-05, "loss": 0.7661, "step": 20600 }, { "epoch": 10.876132930513595, "grad_norm": 1.0433977842330933, "learning_rate": 1.7222222222222224e-05, "loss": 0.7662, "step": 20700 }, { "epoch": 10.928674635491921, "grad_norm": 1.7316306829452515, "learning_rate": 1.7037037037037038e-05, "loss": 0.7694, "step": 20800 }, { "epoch": 10.981216340470247, "grad_norm": 1.1855412721633911, "learning_rate": 1.6851851851851853e-05, "loss": 0.7608, "step": 20900 }, { "epoch": 11.033758045448575, "grad_norm": 1.2221699953079224, "learning_rate": 1.6666666666666667e-05, "loss": 0.7538, "step": 21000 }, { "epoch": 11.033758045448575, "eval_loss": 0.8385075330734253, "eval_runtime": 1.2472, "eval_samples_per_second": 8.018, "eval_steps_per_second": 4.009, "step": 21000 }, { "epoch": 11.086299750426901, "grad_norm": 1.1642967462539673, "learning_rate": 1.6481481481481482e-05, "loss": 0.7348, "step": 21100 }, { "epoch": 11.138841455405228, "grad_norm": 1.7363336086273193, "learning_rate": 1.62962962962963e-05, "loss": 0.756, "step": 21200 }, { "epoch": 11.191383160383554, "grad_norm": 1.3126277923583984, "learning_rate": 1.6111111111111115e-05, "loss": 0.7515, "step": 21300 }, { "epoch": 11.243924865361882, "grad_norm": 2.1372787952423096, "learning_rate": 1.5925925925925926e-05, "loss": 0.7487, "step": 21400 }, { "epoch": 11.296466570340208, "grad_norm": 1.5635650157928467, "learning_rate": 1.574074074074074e-05, "loss": 0.7554, "step": 21500 }, { "epoch": 11.296466570340208, "eval_loss": 0.8373192548751831, "eval_runtime": 1.2681, "eval_samples_per_second": 7.886, "eval_steps_per_second": 3.943, "step": 21500 }, { "epoch": 11.349008275318534, "grad_norm": 1.708914875984192, "learning_rate": 1.5555555555555555e-05, "loss": 0.7648, "step": 21600 }, { "epoch": 11.40154998029686, "grad_norm": 1.1601842641830444, "learning_rate": 1.537037037037037e-05, "loss": 0.7614, "step": 21700 }, { "epoch": 11.454091685275188, "grad_norm": 1.1674526929855347, "learning_rate": 1.5185185185185186e-05, "loss": 0.7653, "step": 21800 }, { "epoch": 11.506633390253514, "grad_norm": 1.2474111318588257, "learning_rate": 1.5e-05, "loss": 0.7596, "step": 21900 }, { "epoch": 11.55917509523184, "grad_norm": 1.2556248903274536, "learning_rate": 1.4814814814814815e-05, "loss": 0.7445, "step": 22000 }, { "epoch": 11.55917509523184, "eval_loss": 0.8322497606277466, "eval_runtime": 1.2428, "eval_samples_per_second": 8.046, "eval_steps_per_second": 4.023, "step": 22000 }, { "epoch": 11.611716800210166, "grad_norm": 1.5506947040557861, "learning_rate": 1.462962962962963e-05, "loss": 0.7449, "step": 22100 }, { "epoch": 11.664258505188494, "grad_norm": 1.124089241027832, "learning_rate": 1.4444444444444444e-05, "loss": 0.7613, "step": 22200 }, { "epoch": 11.71680021016682, "grad_norm": 1.5663940906524658, "learning_rate": 1.425925925925926e-05, "loss": 0.7447, "step": 22300 }, { "epoch": 11.769341915145146, "grad_norm": 1.4085638523101807, "learning_rate": 1.4074074074074075e-05, "loss": 0.745, "step": 22400 }, { "epoch": 11.821883620123472, "grad_norm": 1.8693653345108032, "learning_rate": 1.388888888888889e-05, "loss": 0.7615, "step": 22500 }, { "epoch": 11.821883620123472, "eval_loss": 0.8351184129714966, "eval_runtime": 1.2433, "eval_samples_per_second": 8.043, "eval_steps_per_second": 4.022, "step": 22500 }, { "epoch": 11.8744253251018, "grad_norm": 1.807320237159729, "learning_rate": 1.3703703703703704e-05, "loss": 0.7605, "step": 22600 }, { "epoch": 11.926967030080126, "grad_norm": 1.1300692558288574, "learning_rate": 1.3518518518518519e-05, "loss": 0.7604, "step": 22700 }, { "epoch": 11.979508735058452, "grad_norm": 1.5714143514633179, "learning_rate": 1.3333333333333333e-05, "loss": 0.7462, "step": 22800 }, { "epoch": 12.032050440036778, "grad_norm": 1.1730937957763672, "learning_rate": 1.3148148148148148e-05, "loss": 0.7655, "step": 22900 }, { "epoch": 12.084592145015106, "grad_norm": 1.1946797370910645, "learning_rate": 1.2962962962962962e-05, "loss": 0.7326, "step": 23000 }, { "epoch": 12.084592145015106, "eval_loss": 0.8375317454338074, "eval_runtime": 1.2867, "eval_samples_per_second": 7.772, "eval_steps_per_second": 3.886, "step": 23000 }, { "epoch": 12.137133849993432, "grad_norm": 1.1328163146972656, "learning_rate": 1.2777777777777777e-05, "loss": 0.7439, "step": 23100 }, { "epoch": 12.189675554971759, "grad_norm": 1.026856780052185, "learning_rate": 1.2592592592592592e-05, "loss": 0.7589, "step": 23200 }, { "epoch": 12.242217259950085, "grad_norm": 1.920116662979126, "learning_rate": 1.2407407407407408e-05, "loss": 0.7458, "step": 23300 }, { "epoch": 12.294758964928413, "grad_norm": 1.3238016366958618, "learning_rate": 1.2222222222222222e-05, "loss": 0.74, "step": 23400 }, { "epoch": 12.347300669906739, "grad_norm": 1.192124843597412, "learning_rate": 1.2037037037037037e-05, "loss": 0.7369, "step": 23500 }, { "epoch": 12.347300669906739, "eval_loss": 0.8405577540397644, "eval_runtime": 1.2401, "eval_samples_per_second": 8.064, "eval_steps_per_second": 4.032, "step": 23500 }, { "epoch": 12.399842374885065, "grad_norm": 1.3931485414505005, "learning_rate": 1.1851851851851853e-05, "loss": 0.7619, "step": 23600 }, { "epoch": 12.452384079863391, "grad_norm": 1.071673035621643, "learning_rate": 1.1666666666666668e-05, "loss": 0.753, "step": 23700 }, { "epoch": 12.504925784841719, "grad_norm": 1.2407554388046265, "learning_rate": 1.1481481481481482e-05, "loss": 0.7382, "step": 23800 }, { "epoch": 12.557467489820045, "grad_norm": 1.2360953092575073, "learning_rate": 1.1296296296296297e-05, "loss": 0.7348, "step": 23900 }, { "epoch": 12.610009194798371, "grad_norm": 1.2687872648239136, "learning_rate": 1.1111111111111112e-05, "loss": 0.75, "step": 24000 }, { "epoch": 12.610009194798371, "eval_loss": 0.8404332399368286, "eval_runtime": 1.2513, "eval_samples_per_second": 7.992, "eval_steps_per_second": 3.996, "step": 24000 }, { "epoch": 12.662550899776697, "grad_norm": 1.1866815090179443, "learning_rate": 1.0925925925925926e-05, "loss": 0.7419, "step": 24100 }, { "epoch": 12.715092604755025, "grad_norm": 1.715956687927246, "learning_rate": 1.074074074074074e-05, "loss": 0.7406, "step": 24200 }, { "epoch": 12.767634309733351, "grad_norm": 1.1529712677001953, "learning_rate": 1.0555555555555555e-05, "loss": 0.7516, "step": 24300 }, { "epoch": 12.820176014711677, "grad_norm": 1.2465269565582275, "learning_rate": 1.037037037037037e-05, "loss": 0.7407, "step": 24400 }, { "epoch": 12.872717719690003, "grad_norm": 1.1018378734588623, "learning_rate": 1.0185185185185185e-05, "loss": 0.7514, "step": 24500 }, { "epoch": 12.872717719690003, "eval_loss": 0.8341609835624695, "eval_runtime": 1.2427, "eval_samples_per_second": 8.047, "eval_steps_per_second": 4.024, "step": 24500 }, { "epoch": 12.925259424668331, "grad_norm": 1.2097246646881104, "learning_rate": 1e-05, "loss": 0.7298, "step": 24600 }, { "epoch": 12.977801129646657, "grad_norm": 1.5760364532470703, "learning_rate": 9.814814814814815e-06, "loss": 0.7538, "step": 24700 }, { "epoch": 13.030342834624983, "grad_norm": 1.2670038938522339, "learning_rate": 9.62962962962963e-06, "loss": 0.7437, "step": 24800 }, { "epoch": 13.08288453960331, "grad_norm": 1.178148627281189, "learning_rate": 9.444444444444445e-06, "loss": 0.7392, "step": 24900 }, { "epoch": 13.135426244581637, "grad_norm": 1.2803364992141724, "learning_rate": 9.259259259259259e-06, "loss": 0.7178, "step": 25000 }, { "epoch": 13.135426244581637, "eval_loss": 0.8406384587287903, "eval_runtime": 1.2436, "eval_samples_per_second": 8.041, "eval_steps_per_second": 4.02, "step": 25000 }, { "epoch": 13.187967949559964, "grad_norm": 1.3507287502288818, "learning_rate": 9.074074074074075e-06, "loss": 0.7473, "step": 25100 }, { "epoch": 13.24050965453829, "grad_norm": 1.1901633739471436, "learning_rate": 8.88888888888889e-06, "loss": 0.7386, "step": 25200 }, { "epoch": 13.293051359516616, "grad_norm": 1.25368332862854, "learning_rate": 8.703703703703705e-06, "loss": 0.7426, "step": 25300 }, { "epoch": 13.345593064494944, "grad_norm": 1.4581819772720337, "learning_rate": 8.518518518518519e-06, "loss": 0.7479, "step": 25400 }, { "epoch": 13.39813476947327, "grad_norm": 1.201744556427002, "learning_rate": 8.333333333333334e-06, "loss": 0.7266, "step": 25500 }, { "epoch": 13.39813476947327, "eval_loss": 0.8411173820495605, "eval_runtime": 1.2635, "eval_samples_per_second": 7.915, "eval_steps_per_second": 3.957, "step": 25500 }, { "epoch": 13.450676474451596, "grad_norm": 1.3724862337112427, "learning_rate": 8.14814814814815e-06, "loss": 0.7392, "step": 25600 }, { "epoch": 13.503218179429922, "grad_norm": 1.2667688131332397, "learning_rate": 7.962962962962963e-06, "loss": 0.731, "step": 25700 }, { "epoch": 13.55575988440825, "grad_norm": 1.3549320697784424, "learning_rate": 7.777777777777777e-06, "loss": 0.7307, "step": 25800 }, { "epoch": 13.608301589386576, "grad_norm": 1.3319281339645386, "learning_rate": 7.592592592592593e-06, "loss": 0.7315, "step": 25900 }, { "epoch": 13.660843294364902, "grad_norm": 1.3759852647781372, "learning_rate": 7.4074074074074075e-06, "loss": 0.7338, "step": 26000 }, { "epoch": 13.660843294364902, "eval_loss": 0.8397306203842163, "eval_runtime": 1.2452, "eval_samples_per_second": 8.031, "eval_steps_per_second": 4.015, "step": 26000 }, { "epoch": 13.713384999343228, "grad_norm": 1.452658772468567, "learning_rate": 7.222222222222222e-06, "loss": 0.7339, "step": 26100 }, { "epoch": 13.765926704321554, "grad_norm": 1.1617493629455566, "learning_rate": 7.0370370370370375e-06, "loss": 0.737, "step": 26200 }, { "epoch": 13.818468409299882, "grad_norm": 1.628431797027588, "learning_rate": 6.851851851851852e-06, "loss": 0.7437, "step": 26300 }, { "epoch": 13.871010114278208, "grad_norm": 1.4544808864593506, "learning_rate": 6.666666666666667e-06, "loss": 0.7411, "step": 26400 }, { "epoch": 13.923551819256534, "grad_norm": 1.7027857303619385, "learning_rate": 6.481481481481481e-06, "loss": 0.7432, "step": 26500 }, { "epoch": 13.923551819256534, "eval_loss": 0.8381926417350769, "eval_runtime": 1.2458, "eval_samples_per_second": 8.027, "eval_steps_per_second": 4.013, "step": 26500 }, { "epoch": 13.976093524234862, "grad_norm": 1.320162296295166, "learning_rate": 6.296296296296296e-06, "loss": 0.7463, "step": 26600 }, { "epoch": 14.028635229213188, "grad_norm": 1.2044928073883057, "learning_rate": 6.111111111111111e-06, "loss": 0.7358, "step": 26700 }, { "epoch": 14.081176934191515, "grad_norm": 1.0798685550689697, "learning_rate": 5.925925925925927e-06, "loss": 0.7416, "step": 26800 }, { "epoch": 14.13371863916984, "grad_norm": 1.4219785928726196, "learning_rate": 5.740740740740741e-06, "loss": 0.7256, "step": 26900 }, { "epoch": 14.186260344148167, "grad_norm": 1.1347541809082031, "learning_rate": 5.555555555555556e-06, "loss": 0.7322, "step": 27000 }, { "epoch": 14.186260344148167, "eval_loss": 0.8432482481002808, "eval_runtime": 1.2881, "eval_samples_per_second": 7.763, "eval_steps_per_second": 3.882, "step": 27000 }, { "epoch": 14.238802049126495, "grad_norm": 1.5265473127365112, "learning_rate": 5.37037037037037e-06, "loss": 0.7281, "step": 27100 }, { "epoch": 14.29134375410482, "grad_norm": 1.0720326900482178, "learning_rate": 5.185185185185185e-06, "loss": 0.731, "step": 27200 }, { "epoch": 14.343885459083147, "grad_norm": 1.5150460004806519, "learning_rate": 5e-06, "loss": 0.719, "step": 27300 }, { "epoch": 14.396427164061473, "grad_norm": 1.2847768068313599, "learning_rate": 4.814814814814815e-06, "loss": 0.7307, "step": 27400 }, { "epoch": 14.4489688690398, "grad_norm": 1.4064009189605713, "learning_rate": 4.6296296296296296e-06, "loss": 0.7293, "step": 27500 }, { "epoch": 14.4489688690398, "eval_loss": 0.8425260782241821, "eval_runtime": 1.247, "eval_samples_per_second": 8.019, "eval_steps_per_second": 4.01, "step": 27500 }, { "epoch": 14.501510574018127, "grad_norm": 1.1766741275787354, "learning_rate": 4.444444444444445e-06, "loss": 0.7406, "step": 27600 }, { "epoch": 14.554052278996453, "grad_norm": 1.3252798318862915, "learning_rate": 4.2592592592592596e-06, "loss": 0.7377, "step": 27700 }, { "epoch": 14.60659398397478, "grad_norm": 1.2687978744506836, "learning_rate": 4.074074074074075e-06, "loss": 0.7303, "step": 27800 }, { "epoch": 14.659135688953107, "grad_norm": 1.29690420627594, "learning_rate": 3.888888888888889e-06, "loss": 0.7349, "step": 27900 }, { "epoch": 14.711677393931433, "grad_norm": 1.4593523740768433, "learning_rate": 3.7037037037037037e-06, "loss": 0.7327, "step": 28000 }, { "epoch": 14.711677393931433, "eval_loss": 0.842442512512207, "eval_runtime": 1.2608, "eval_samples_per_second": 7.931, "eval_steps_per_second": 3.966, "step": 28000 }, { "epoch": 14.76421909890976, "grad_norm": 1.1667600870132446, "learning_rate": 3.5185185185185187e-06, "loss": 0.7276, "step": 28100 }, { "epoch": 14.816760803888085, "grad_norm": 1.0612125396728516, "learning_rate": 3.3333333333333333e-06, "loss": 0.7468, "step": 28200 }, { "epoch": 14.869302508866413, "grad_norm": 1.117440938949585, "learning_rate": 3.148148148148148e-06, "loss": 0.7189, "step": 28300 }, { "epoch": 14.92184421384474, "grad_norm": 1.1088405847549438, "learning_rate": 2.9629629629629633e-06, "loss": 0.732, "step": 28400 }, { "epoch": 14.974385918823065, "grad_norm": 1.381748914718628, "learning_rate": 2.777777777777778e-06, "loss": 0.7368, "step": 28500 }, { "epoch": 14.974385918823065, "eval_loss": 0.844028651714325, "eval_runtime": 1.2451, "eval_samples_per_second": 8.031, "eval_steps_per_second": 4.016, "step": 28500 }, { "epoch": 15.026927623801392, "grad_norm": 1.2738327980041504, "learning_rate": 2.5925925925925925e-06, "loss": 0.7411, "step": 28600 }, { "epoch": 15.07946932877972, "grad_norm": 1.450596809387207, "learning_rate": 2.4074074074074075e-06, "loss": 0.721, "step": 28700 }, { "epoch": 15.132011033758046, "grad_norm": 1.2589495182037354, "learning_rate": 2.2222222222222225e-06, "loss": 0.7134, "step": 28800 }, { "epoch": 15.184552738736372, "grad_norm": 1.1227418184280396, "learning_rate": 2.0370370370370375e-06, "loss": 0.7361, "step": 28900 }, { "epoch": 15.237094443714698, "grad_norm": 1.1688610315322876, "learning_rate": 1.8518518518518519e-06, "loss": 0.722, "step": 29000 }, { "epoch": 15.237094443714698, "eval_loss": 0.8448740243911743, "eval_runtime": 1.2712, "eval_samples_per_second": 7.867, "eval_steps_per_second": 3.933, "step": 29000 }, { "epoch": 15.289636148693026, "grad_norm": 1.4582873582839966, "learning_rate": 1.6666666666666667e-06, "loss": 0.7236, "step": 29100 }, { "epoch": 15.342177853671352, "grad_norm": 1.5905903577804565, "learning_rate": 1.4814814814814817e-06, "loss": 0.7332, "step": 29200 }, { "epoch": 15.394719558649678, "grad_norm": 2.0454487800598145, "learning_rate": 1.2962962962962962e-06, "loss": 0.7222, "step": 29300 }, { "epoch": 15.447261263628004, "grad_norm": 1.3177134990692139, "learning_rate": 1.1111111111111112e-06, "loss": 0.7362, "step": 29400 }, { "epoch": 15.499802968606332, "grad_norm": 1.0010744333267212, "learning_rate": 9.259259259259259e-07, "loss": 0.7304, "step": 29500 }, { "epoch": 15.499802968606332, "eval_loss": 0.84196537733078, "eval_runtime": 1.2445, "eval_samples_per_second": 8.035, "eval_steps_per_second": 4.018, "step": 29500 }, { "epoch": 15.552344673584658, "grad_norm": 1.296751618385315, "learning_rate": 7.407407407407408e-07, "loss": 0.719, "step": 29600 }, { "epoch": 15.604886378562984, "grad_norm": 1.198317050933838, "learning_rate": 5.555555555555556e-07, "loss": 0.7359, "step": 29700 }, { "epoch": 15.65742808354131, "grad_norm": 1.3005775213241577, "learning_rate": 3.703703703703704e-07, "loss": 0.7268, "step": 29800 }, { "epoch": 15.709969788519638, "grad_norm": 1.1755659580230713, "learning_rate": 1.851851851851852e-07, "loss": 0.7266, "step": 29900 }, { "epoch": 15.762511493497964, "grad_norm": 1.1558780670166016, "learning_rate": 0.0, "loss": 0.7389, "step": 30000 }, { "epoch": 15.762511493497964, "eval_loss": 0.8437487483024597, "eval_runtime": 1.263, "eval_samples_per_second": 7.918, "eval_steps_per_second": 3.959, "step": 30000 } ], "logging_steps": 100, "max_steps": 30000, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.658737511387988e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }