|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 15.762511493497964, |
|
"eval_steps": 500, |
|
"global_step": 30000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.052541704978326546, |
|
"grad_norm": 0.744549572467804, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 1.663, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10508340995665309, |
|
"grad_norm": 0.6462538838386536, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.616, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15762511493497963, |
|
"grad_norm": 0.5421179533004761, |
|
"learning_rate": 5e-06, |
|
"loss": 1.4549, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.21016681991330619, |
|
"grad_norm": 0.8848033547401428, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.3941, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2627085248916327, |
|
"grad_norm": 0.7000551819801331, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.2752, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2627085248916327, |
|
"eval_loss": 1.0633598566055298, |
|
"eval_runtime": 1.3002, |
|
"eval_samples_per_second": 7.691, |
|
"eval_steps_per_second": 3.845, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.31525022986995926, |
|
"grad_norm": 0.8002354502677917, |
|
"learning_rate": 1e-05, |
|
"loss": 1.2205, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3677919348482858, |
|
"grad_norm": 0.7082624435424805, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 1.188, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.42033363982661237, |
|
"grad_norm": 0.626582682132721, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 1.1647, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4728753448049389, |
|
"grad_norm": 1.0509939193725586, |
|
"learning_rate": 1.5e-05, |
|
"loss": 1.1427, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5254170497832654, |
|
"grad_norm": 1.1202670335769653, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.144, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5254170497832654, |
|
"eval_loss": 0.9546993374824524, |
|
"eval_runtime": 1.2391, |
|
"eval_samples_per_second": 8.07, |
|
"eval_steps_per_second": 4.035, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.577958754761592, |
|
"grad_norm": 1.1231364011764526, |
|
"learning_rate": 1.8333333333333333e-05, |
|
"loss": 1.0935, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.6305004597399185, |
|
"grad_norm": 1.0733870267868042, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1038, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.6830421647182451, |
|
"grad_norm": 1.0306121110916138, |
|
"learning_rate": 2.1666666666666667e-05, |
|
"loss": 1.1055, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.7355838696965716, |
|
"grad_norm": 1.2776762247085571, |
|
"learning_rate": 2.3333333333333336e-05, |
|
"loss": 1.0724, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.7881255746748982, |
|
"grad_norm": 1.0678656101226807, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.0564, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7881255746748982, |
|
"eval_loss": 0.912558913230896, |
|
"eval_runtime": 1.2576, |
|
"eval_samples_per_second": 7.952, |
|
"eval_steps_per_second": 3.976, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.8406672796532247, |
|
"grad_norm": 1.6063746213912964, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 1.0391, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.8932089846315513, |
|
"grad_norm": 1.367933750152588, |
|
"learning_rate": 2.8333333333333335e-05, |
|
"loss": 1.0435, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.9457506896098778, |
|
"grad_norm": 1.0610206127166748, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0436, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.9982923945882044, |
|
"grad_norm": 0.8625739216804504, |
|
"learning_rate": 3.1666666666666666e-05, |
|
"loss": 1.0334, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.0508340995665308, |
|
"grad_norm": 1.4904906749725342, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.0228, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.0508340995665308, |
|
"eval_loss": 0.8912132978439331, |
|
"eval_runtime": 1.2665, |
|
"eval_samples_per_second": 7.896, |
|
"eval_steps_per_second": 3.948, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.1033758045448574, |
|
"grad_norm": 1.4299525022506714, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.0009, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.155917509523184, |
|
"grad_norm": 1.2776519060134888, |
|
"learning_rate": 3.6666666666666666e-05, |
|
"loss": 0.9988, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.2084592145015105, |
|
"grad_norm": 1.1559321880340576, |
|
"learning_rate": 3.8333333333333334e-05, |
|
"loss": 1.0003, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.261000919479837, |
|
"grad_norm": 0.9451723098754883, |
|
"learning_rate": 4e-05, |
|
"loss": 0.9961, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.3135426244581636, |
|
"grad_norm": 1.4040967226028442, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.9981, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.3135426244581636, |
|
"eval_loss": 0.8769587278366089, |
|
"eval_runtime": 1.2527, |
|
"eval_samples_per_second": 7.983, |
|
"eval_steps_per_second": 3.991, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.3660843294364902, |
|
"grad_norm": 1.469843864440918, |
|
"learning_rate": 4.3333333333333334e-05, |
|
"loss": 0.9868, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.4186260344148167, |
|
"grad_norm": 1.1479499340057373, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.9822, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.4711677393931433, |
|
"grad_norm": 2.1333274841308594, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 0.9921, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.5237094443714698, |
|
"grad_norm": 1.162014365196228, |
|
"learning_rate": 4.8333333333333334e-05, |
|
"loss": 0.9813, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.5762511493497964, |
|
"grad_norm": 0.9883254170417786, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9796, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.5762511493497964, |
|
"eval_loss": 0.8728264570236206, |
|
"eval_runtime": 1.262, |
|
"eval_samples_per_second": 7.924, |
|
"eval_steps_per_second": 3.962, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.628792854328123, |
|
"grad_norm": 1.1366589069366455, |
|
"learning_rate": 4.981481481481482e-05, |
|
"loss": 0.985, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.6813345593064495, |
|
"grad_norm": 1.0342519283294678, |
|
"learning_rate": 4.962962962962963e-05, |
|
"loss": 0.9872, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.733876264284776, |
|
"grad_norm": 0.9889224767684937, |
|
"learning_rate": 4.9444444444444446e-05, |
|
"loss": 0.9892, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.7864179692631026, |
|
"grad_norm": 0.9875237345695496, |
|
"learning_rate": 4.925925925925926e-05, |
|
"loss": 0.9606, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.8389596742414291, |
|
"grad_norm": 0.9079448580741882, |
|
"learning_rate": 4.9074074074074075e-05, |
|
"loss": 0.9846, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.8389596742414291, |
|
"eval_loss": 0.869242787361145, |
|
"eval_runtime": 1.2422, |
|
"eval_samples_per_second": 8.05, |
|
"eval_steps_per_second": 4.025, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.8915013792197557, |
|
"grad_norm": 1.0478876829147339, |
|
"learning_rate": 4.888888888888889e-05, |
|
"loss": 0.9616, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.9440430841980822, |
|
"grad_norm": 1.0536926984786987, |
|
"learning_rate": 4.8703703703703704e-05, |
|
"loss": 0.9674, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.9965847891764088, |
|
"grad_norm": 1.1787534952163696, |
|
"learning_rate": 4.851851851851852e-05, |
|
"loss": 0.9567, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.049126494154735, |
|
"grad_norm": 1.0072757005691528, |
|
"learning_rate": 4.8333333333333334e-05, |
|
"loss": 0.9441, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.1016681991330617, |
|
"grad_norm": 1.2437305450439453, |
|
"learning_rate": 4.814814814814815e-05, |
|
"loss": 0.9316, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.1016681991330617, |
|
"eval_loss": 0.8525739908218384, |
|
"eval_runtime": 1.2391, |
|
"eval_samples_per_second": 8.07, |
|
"eval_steps_per_second": 4.035, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.1542099041113882, |
|
"grad_norm": 1.722901701927185, |
|
"learning_rate": 4.796296296296296e-05, |
|
"loss": 0.934, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.206751609089715, |
|
"grad_norm": 1.032166600227356, |
|
"learning_rate": 4.7777777777777784e-05, |
|
"loss": 0.9324, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.2592933140680413, |
|
"grad_norm": 1.0329689979553223, |
|
"learning_rate": 4.759259259259259e-05, |
|
"loss": 0.9327, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.311835019046368, |
|
"grad_norm": 0.8714644908905029, |
|
"learning_rate": 4.740740740740741e-05, |
|
"loss": 0.9228, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.3643767240246945, |
|
"grad_norm": 1.048893928527832, |
|
"learning_rate": 4.722222222222222e-05, |
|
"loss": 0.9356, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.3643767240246945, |
|
"eval_loss": 0.8477146029472351, |
|
"eval_runtime": 1.2917, |
|
"eval_samples_per_second": 7.742, |
|
"eval_steps_per_second": 3.871, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.416918429003021, |
|
"grad_norm": 1.1038933992385864, |
|
"learning_rate": 4.703703703703704e-05, |
|
"loss": 0.9317, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.4694601339813476, |
|
"grad_norm": 0.9085045456886292, |
|
"learning_rate": 4.685185185185185e-05, |
|
"loss": 0.9411, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.522001838959674, |
|
"grad_norm": 1.1415351629257202, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 0.9227, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.5745435439380007, |
|
"grad_norm": 0.9230244755744934, |
|
"learning_rate": 4.648148148148148e-05, |
|
"loss": 0.9362, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.627085248916327, |
|
"grad_norm": 0.9721058011054993, |
|
"learning_rate": 4.62962962962963e-05, |
|
"loss": 0.9297, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.627085248916327, |
|
"eval_loss": 0.8504465818405151, |
|
"eval_runtime": 1.2834, |
|
"eval_samples_per_second": 7.792, |
|
"eval_steps_per_second": 3.896, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.6796269538946538, |
|
"grad_norm": 0.933588981628418, |
|
"learning_rate": 4.6111111111111115e-05, |
|
"loss": 0.9339, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.7321686588729803, |
|
"grad_norm": 0.915013313293457, |
|
"learning_rate": 4.592592592592593e-05, |
|
"loss": 0.9322, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.784710363851307, |
|
"grad_norm": 1.2846601009368896, |
|
"learning_rate": 4.5740740740740745e-05, |
|
"loss": 0.913, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.8372520688296334, |
|
"grad_norm": 1.1330525875091553, |
|
"learning_rate": 4.555555555555556e-05, |
|
"loss": 0.9241, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.88979377380796, |
|
"grad_norm": 1.6967169046401978, |
|
"learning_rate": 4.5370370370370374e-05, |
|
"loss": 0.9127, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.88979377380796, |
|
"eval_loss": 0.8335689306259155, |
|
"eval_runtime": 1.2838, |
|
"eval_samples_per_second": 7.789, |
|
"eval_steps_per_second": 3.895, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.9423354787862865, |
|
"grad_norm": 0.9576010704040527, |
|
"learning_rate": 4.518518518518519e-05, |
|
"loss": 0.9097, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.994877183764613, |
|
"grad_norm": 1.229671835899353, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.9165, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 3.0474188887429396, |
|
"grad_norm": 0.9229328632354736, |
|
"learning_rate": 4.481481481481482e-05, |
|
"loss": 0.9066, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 3.099960593721266, |
|
"grad_norm": 1.1152088642120361, |
|
"learning_rate": 4.462962962962963e-05, |
|
"loss": 0.8888, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 3.1525022986995928, |
|
"grad_norm": 1.0948377847671509, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.9233, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.1525022986995928, |
|
"eval_loss": 0.8405929803848267, |
|
"eval_runtime": 1.2669, |
|
"eval_samples_per_second": 7.893, |
|
"eval_steps_per_second": 3.947, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.2050440036779193, |
|
"grad_norm": 1.320657730102539, |
|
"learning_rate": 4.425925925925926e-05, |
|
"loss": 0.9024, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 3.257585708656246, |
|
"grad_norm": 1.3515260219573975, |
|
"learning_rate": 4.4074074074074076e-05, |
|
"loss": 0.8992, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 3.3101274136345724, |
|
"grad_norm": 0.9061586260795593, |
|
"learning_rate": 4.388888888888889e-05, |
|
"loss": 0.8966, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 3.362669118612899, |
|
"grad_norm": 0.7893356084823608, |
|
"learning_rate": 4.3703703703703705e-05, |
|
"loss": 0.8772, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 3.4152108235912255, |
|
"grad_norm": 1.1485052108764648, |
|
"learning_rate": 4.351851851851852e-05, |
|
"loss": 0.8935, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.4152108235912255, |
|
"eval_loss": 0.8316594362258911, |
|
"eval_runtime": 1.2559, |
|
"eval_samples_per_second": 7.962, |
|
"eval_steps_per_second": 3.981, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.467752528569552, |
|
"grad_norm": 1.2252397537231445, |
|
"learning_rate": 4.3333333333333334e-05, |
|
"loss": 0.8877, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.5202942335478786, |
|
"grad_norm": 0.937276303768158, |
|
"learning_rate": 4.314814814814815e-05, |
|
"loss": 0.899, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 3.572835938526205, |
|
"grad_norm": 1.1383343935012817, |
|
"learning_rate": 4.296296296296296e-05, |
|
"loss": 0.884, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.6253776435045317, |
|
"grad_norm": 0.9624786376953125, |
|
"learning_rate": 4.277777777777778e-05, |
|
"loss": 0.8933, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 3.6779193484828583, |
|
"grad_norm": 1.2297359704971313, |
|
"learning_rate": 4.259259259259259e-05, |
|
"loss": 0.8902, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.6779193484828583, |
|
"eval_loss": 0.8357389569282532, |
|
"eval_runtime": 1.2564, |
|
"eval_samples_per_second": 7.959, |
|
"eval_steps_per_second": 3.979, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.730461053461185, |
|
"grad_norm": 1.3117122650146484, |
|
"learning_rate": 4.240740740740741e-05, |
|
"loss": 0.8833, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 3.7830027584395114, |
|
"grad_norm": 1.0405583381652832, |
|
"learning_rate": 4.222222222222222e-05, |
|
"loss": 0.8974, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.835544463417838, |
|
"grad_norm": 1.1837128400802612, |
|
"learning_rate": 4.203703703703704e-05, |
|
"loss": 0.9108, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 3.8880861683961645, |
|
"grad_norm": 1.070272445678711, |
|
"learning_rate": 4.185185185185185e-05, |
|
"loss": 0.888, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.940627873374491, |
|
"grad_norm": 1.2699558734893799, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.8996, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.940627873374491, |
|
"eval_loss": 0.8259842991828918, |
|
"eval_runtime": 1.2588, |
|
"eval_samples_per_second": 7.944, |
|
"eval_steps_per_second": 3.972, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.9931695783528176, |
|
"grad_norm": 1.1410374641418457, |
|
"learning_rate": 4.148148148148148e-05, |
|
"loss": 0.8931, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 4.045711283331144, |
|
"grad_norm": 0.8494368195533752, |
|
"learning_rate": 4.12962962962963e-05, |
|
"loss": 0.8735, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 4.09825298830947, |
|
"grad_norm": 1.6662884950637817, |
|
"learning_rate": 4.111111111111111e-05, |
|
"loss": 0.8706, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 4.150794693287797, |
|
"grad_norm": 1.103721022605896, |
|
"learning_rate": 4.092592592592593e-05, |
|
"loss": 0.8868, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 4.203336398266123, |
|
"grad_norm": 1.0457319021224976, |
|
"learning_rate": 4.074074074074074e-05, |
|
"loss": 0.8616, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 4.203336398266123, |
|
"eval_loss": 0.8296699523925781, |
|
"eval_runtime": 1.2594, |
|
"eval_samples_per_second": 7.941, |
|
"eval_steps_per_second": 3.97, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 4.25587810324445, |
|
"grad_norm": 1.038172721862793, |
|
"learning_rate": 4.055555555555556e-05, |
|
"loss": 0.8814, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 4.3084198082227765, |
|
"grad_norm": 1.0770879983901978, |
|
"learning_rate": 4.0370370370370374e-05, |
|
"loss": 0.8781, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 4.3609615132011035, |
|
"grad_norm": 0.9559742212295532, |
|
"learning_rate": 4.018518518518519e-05, |
|
"loss": 0.8631, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 4.41350321817943, |
|
"grad_norm": 1.2352747917175293, |
|
"learning_rate": 4e-05, |
|
"loss": 0.8619, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 4.466044923157757, |
|
"grad_norm": 1.0156300067901611, |
|
"learning_rate": 3.981481481481482e-05, |
|
"loss": 0.8589, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 4.466044923157757, |
|
"eval_loss": 0.8272676467895508, |
|
"eval_runtime": 1.239, |
|
"eval_samples_per_second": 8.071, |
|
"eval_steps_per_second": 4.035, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 4.518586628136083, |
|
"grad_norm": 1.3172954320907593, |
|
"learning_rate": 3.962962962962963e-05, |
|
"loss": 0.8782, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 4.57112833311441, |
|
"grad_norm": 1.0361146926879883, |
|
"learning_rate": 3.944444444444445e-05, |
|
"loss": 0.8929, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 4.623670038092736, |
|
"grad_norm": 1.0141929388046265, |
|
"learning_rate": 3.925925925925926e-05, |
|
"loss": 0.8701, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 4.676211743071063, |
|
"grad_norm": 1.4929876327514648, |
|
"learning_rate": 3.9074074074074076e-05, |
|
"loss": 0.8877, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 4.728753448049389, |
|
"grad_norm": 1.6893892288208008, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 0.8751, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.728753448049389, |
|
"eval_loss": 0.822452187538147, |
|
"eval_runtime": 1.246, |
|
"eval_samples_per_second": 8.026, |
|
"eval_steps_per_second": 4.013, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.781295153027716, |
|
"grad_norm": 0.993008017539978, |
|
"learning_rate": 3.8703703703703705e-05, |
|
"loss": 0.8689, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 4.833836858006042, |
|
"grad_norm": 0.8024290800094604, |
|
"learning_rate": 3.851851851851852e-05, |
|
"loss": 0.8945, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 4.886378562984369, |
|
"grad_norm": 0.8442456722259521, |
|
"learning_rate": 3.8333333333333334e-05, |
|
"loss": 0.8897, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 4.938920267962695, |
|
"grad_norm": 1.6505470275878906, |
|
"learning_rate": 3.814814814814815e-05, |
|
"loss": 0.8481, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 4.991461972941022, |
|
"grad_norm": 0.9656145572662354, |
|
"learning_rate": 3.7962962962962964e-05, |
|
"loss": 0.8699, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.991461972941022, |
|
"eval_loss": 0.8228476643562317, |
|
"eval_runtime": 1.2383, |
|
"eval_samples_per_second": 8.075, |
|
"eval_steps_per_second": 4.038, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 5.044003677919348, |
|
"grad_norm": 1.1603432893753052, |
|
"learning_rate": 3.777777777777778e-05, |
|
"loss": 0.8731, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 5.096545382897675, |
|
"grad_norm": 0.8760092854499817, |
|
"learning_rate": 3.759259259259259e-05, |
|
"loss": 0.8268, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 5.149087087876001, |
|
"grad_norm": 1.50885009765625, |
|
"learning_rate": 3.740740740740741e-05, |
|
"loss": 0.8341, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 5.201628792854328, |
|
"grad_norm": 1.164678692817688, |
|
"learning_rate": 3.722222222222222e-05, |
|
"loss": 0.8511, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 5.254170497832654, |
|
"grad_norm": 1.2179008722305298, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 0.863, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 5.254170497832654, |
|
"eval_loss": 0.8227807283401489, |
|
"eval_runtime": 1.2397, |
|
"eval_samples_per_second": 8.067, |
|
"eval_steps_per_second": 4.033, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 5.306712202810981, |
|
"grad_norm": 1.1593024730682373, |
|
"learning_rate": 3.685185185185185e-05, |
|
"loss": 0.8481, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 5.3592539077893075, |
|
"grad_norm": 1.1822712421417236, |
|
"learning_rate": 3.6666666666666666e-05, |
|
"loss": 0.8435, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 5.4117956127676345, |
|
"grad_norm": 1.1157796382904053, |
|
"learning_rate": 3.648148148148148e-05, |
|
"loss": 0.8265, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 5.464337317745961, |
|
"grad_norm": 1.2513401508331299, |
|
"learning_rate": 3.62962962962963e-05, |
|
"loss": 0.8916, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 5.516879022724288, |
|
"grad_norm": 1.07020902633667, |
|
"learning_rate": 3.611111111111111e-05, |
|
"loss": 0.855, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 5.516879022724288, |
|
"eval_loss": 0.8260787725448608, |
|
"eval_runtime": 1.2426, |
|
"eval_samples_per_second": 8.048, |
|
"eval_steps_per_second": 4.024, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 5.569420727702614, |
|
"grad_norm": 1.281821370124817, |
|
"learning_rate": 3.592592592592593e-05, |
|
"loss": 0.8375, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 5.621962432680941, |
|
"grad_norm": 1.279232144355774, |
|
"learning_rate": 3.574074074074074e-05, |
|
"loss": 0.8374, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 5.674504137659267, |
|
"grad_norm": 1.6183032989501953, |
|
"learning_rate": 3.555555555555556e-05, |
|
"loss": 0.8396, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 5.727045842637594, |
|
"grad_norm": 1.5569584369659424, |
|
"learning_rate": 3.537037037037037e-05, |
|
"loss": 0.8474, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 5.77958754761592, |
|
"grad_norm": 0.9284697771072388, |
|
"learning_rate": 3.518518518518519e-05, |
|
"loss": 0.8447, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.77958754761592, |
|
"eval_loss": 0.82489413022995, |
|
"eval_runtime": 1.2928, |
|
"eval_samples_per_second": 7.735, |
|
"eval_steps_per_second": 3.868, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.832129252594247, |
|
"grad_norm": 1.129302740097046, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.8425, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 5.884670957572573, |
|
"grad_norm": 1.1644898653030396, |
|
"learning_rate": 3.481481481481482e-05, |
|
"loss": 0.8545, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 5.9372126625509, |
|
"grad_norm": 1.5778745412826538, |
|
"learning_rate": 3.4629629629629626e-05, |
|
"loss": 0.8488, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 5.989754367529226, |
|
"grad_norm": 1.0982458591461182, |
|
"learning_rate": 3.444444444444445e-05, |
|
"loss": 0.8368, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 6.042296072507553, |
|
"grad_norm": 1.3936185836791992, |
|
"learning_rate": 3.425925925925926e-05, |
|
"loss": 0.8165, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 6.042296072507553, |
|
"eval_loss": 0.8229165077209473, |
|
"eval_runtime": 1.2812, |
|
"eval_samples_per_second": 7.805, |
|
"eval_steps_per_second": 3.903, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 6.094837777485879, |
|
"grad_norm": 0.8609622716903687, |
|
"learning_rate": 3.4074074074074077e-05, |
|
"loss": 0.813, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 6.147379482464206, |
|
"grad_norm": 1.75618314743042, |
|
"learning_rate": 3.388888888888889e-05, |
|
"loss": 0.8285, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 6.199921187442532, |
|
"grad_norm": 1.5755348205566406, |
|
"learning_rate": 3.3703703703703706e-05, |
|
"loss": 0.8061, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 6.252462892420859, |
|
"grad_norm": 1.1621313095092773, |
|
"learning_rate": 3.351851851851852e-05, |
|
"loss": 0.8349, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 6.3050045973991855, |
|
"grad_norm": 1.3047136068344116, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.8247, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 6.3050045973991855, |
|
"eval_loss": 0.8217476010322571, |
|
"eval_runtime": 1.249, |
|
"eval_samples_per_second": 8.007, |
|
"eval_steps_per_second": 4.003, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 6.3575463023775125, |
|
"grad_norm": 1.161156415939331, |
|
"learning_rate": 3.314814814814815e-05, |
|
"loss": 0.8248, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 6.410088007355839, |
|
"grad_norm": 1.095155954360962, |
|
"learning_rate": 3.2962962962962964e-05, |
|
"loss": 0.8275, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 6.462629712334166, |
|
"grad_norm": 1.3355261087417603, |
|
"learning_rate": 3.277777777777778e-05, |
|
"loss": 0.833, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 6.515171417312492, |
|
"grad_norm": 1.5305536985397339, |
|
"learning_rate": 3.25925925925926e-05, |
|
"loss": 0.8259, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 6.567713122290819, |
|
"grad_norm": 2.5994789600372314, |
|
"learning_rate": 3.240740740740741e-05, |
|
"loss": 0.8459, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 6.567713122290819, |
|
"eval_loss": 0.8286579251289368, |
|
"eval_runtime": 1.2728, |
|
"eval_samples_per_second": 7.856, |
|
"eval_steps_per_second": 3.928, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 6.620254827269145, |
|
"grad_norm": 1.442550539970398, |
|
"learning_rate": 3.222222222222223e-05, |
|
"loss": 0.8294, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 6.672796532247471, |
|
"grad_norm": 1.1451183557510376, |
|
"learning_rate": 3.203703703703704e-05, |
|
"loss": 0.8284, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 6.725338237225798, |
|
"grad_norm": 1.0634784698486328, |
|
"learning_rate": 3.185185185185185e-05, |
|
"loss": 0.823, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 6.777879942204125, |
|
"grad_norm": 1.2910544872283936, |
|
"learning_rate": 3.1666666666666666e-05, |
|
"loss": 0.8248, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 6.830421647182451, |
|
"grad_norm": 1.1855148077011108, |
|
"learning_rate": 3.148148148148148e-05, |
|
"loss": 0.854, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 6.830421647182451, |
|
"eval_loss": 0.8177132606506348, |
|
"eval_runtime": 1.2889, |
|
"eval_samples_per_second": 7.759, |
|
"eval_steps_per_second": 3.879, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 6.882963352160777, |
|
"grad_norm": 1.2636213302612305, |
|
"learning_rate": 3.1296296296296295e-05, |
|
"loss": 0.8429, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 6.935505057139104, |
|
"grad_norm": 1.2894644737243652, |
|
"learning_rate": 3.111111111111111e-05, |
|
"loss": 0.8279, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 6.988046762117431, |
|
"grad_norm": 1.64339280128479, |
|
"learning_rate": 3.0925925925925924e-05, |
|
"loss": 0.8217, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 7.040588467095757, |
|
"grad_norm": 0.9605098962783813, |
|
"learning_rate": 3.074074074074074e-05, |
|
"loss": 0.8375, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 7.093130172074083, |
|
"grad_norm": 1.4828298091888428, |
|
"learning_rate": 3.055555555555556e-05, |
|
"loss": 0.8079, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 7.093130172074083, |
|
"eval_loss": 0.8257712125778198, |
|
"eval_runtime": 1.2505, |
|
"eval_samples_per_second": 7.997, |
|
"eval_steps_per_second": 3.998, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 7.14567187705241, |
|
"grad_norm": 1.3807919025421143, |
|
"learning_rate": 3.037037037037037e-05, |
|
"loss": 0.8096, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 7.1982135820307365, |
|
"grad_norm": 1.5678062438964844, |
|
"learning_rate": 3.018518518518519e-05, |
|
"loss": 0.7869, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 7.2507552870090635, |
|
"grad_norm": 1.0710649490356445, |
|
"learning_rate": 3e-05, |
|
"loss": 0.813, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 7.30329699198739, |
|
"grad_norm": 1.150152564048767, |
|
"learning_rate": 2.981481481481482e-05, |
|
"loss": 0.7974, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 7.355838696965717, |
|
"grad_norm": 1.4336960315704346, |
|
"learning_rate": 2.962962962962963e-05, |
|
"loss": 0.8126, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 7.355838696965717, |
|
"eval_loss": 0.8250346183776855, |
|
"eval_runtime": 1.2966, |
|
"eval_samples_per_second": 7.712, |
|
"eval_steps_per_second": 3.856, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 7.408380401944043, |
|
"grad_norm": 1.454750895500183, |
|
"learning_rate": 2.9444444444444448e-05, |
|
"loss": 0.8534, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 7.46092210692237, |
|
"grad_norm": 1.3246815204620361, |
|
"learning_rate": 2.925925925925926e-05, |
|
"loss": 0.8313, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 7.513463811900696, |
|
"grad_norm": 1.116432547569275, |
|
"learning_rate": 2.9074074074074077e-05, |
|
"loss": 0.8085, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 7.566005516879023, |
|
"grad_norm": 1.0311377048492432, |
|
"learning_rate": 2.8888888888888888e-05, |
|
"loss": 0.7984, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 7.618547221857349, |
|
"grad_norm": 1.5071169137954712, |
|
"learning_rate": 2.8703703703703706e-05, |
|
"loss": 0.817, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 7.618547221857349, |
|
"eval_loss": 0.8244579434394836, |
|
"eval_runtime": 1.2449, |
|
"eval_samples_per_second": 8.032, |
|
"eval_steps_per_second": 4.016, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 7.671088926835676, |
|
"grad_norm": 1.2359856367111206, |
|
"learning_rate": 2.851851851851852e-05, |
|
"loss": 0.807, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 7.723630631814002, |
|
"grad_norm": 1.2495603561401367, |
|
"learning_rate": 2.8333333333333335e-05, |
|
"loss": 0.8035, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 7.776172336792329, |
|
"grad_norm": 1.0622730255126953, |
|
"learning_rate": 2.814814814814815e-05, |
|
"loss": 0.8199, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 7.828714041770655, |
|
"grad_norm": 1.097486138343811, |
|
"learning_rate": 2.7962962962962965e-05, |
|
"loss": 0.8107, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 7.881255746748982, |
|
"grad_norm": 1.1956055164337158, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.7962, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 7.881255746748982, |
|
"eval_loss": 0.8224829435348511, |
|
"eval_runtime": 1.2586, |
|
"eval_samples_per_second": 7.946, |
|
"eval_steps_per_second": 3.973, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 7.933797451727308, |
|
"grad_norm": 1.154567837715149, |
|
"learning_rate": 2.7592592592592594e-05, |
|
"loss": 0.8278, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 7.986339156705635, |
|
"grad_norm": 1.4666606187820435, |
|
"learning_rate": 2.7407407407407408e-05, |
|
"loss": 0.8157, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 8.038880861683962, |
|
"grad_norm": 1.115820050239563, |
|
"learning_rate": 2.7222222222222223e-05, |
|
"loss": 0.7899, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 8.091422566662288, |
|
"grad_norm": 1.4497051239013672, |
|
"learning_rate": 2.7037037037037037e-05, |
|
"loss": 0.7809, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 8.143964271640614, |
|
"grad_norm": 1.9520905017852783, |
|
"learning_rate": 2.6851851851851855e-05, |
|
"loss": 0.7972, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 8.143964271640614, |
|
"eval_loss": 0.8295345306396484, |
|
"eval_runtime": 1.2468, |
|
"eval_samples_per_second": 8.02, |
|
"eval_steps_per_second": 4.01, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 8.19650597661894, |
|
"grad_norm": 1.1650198698043823, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 0.7869, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 8.249047681597268, |
|
"grad_norm": 0.9997352957725525, |
|
"learning_rate": 2.6481481481481485e-05, |
|
"loss": 0.7786, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 8.301589386575595, |
|
"grad_norm": 1.173376202583313, |
|
"learning_rate": 2.6296296296296296e-05, |
|
"loss": 0.7903, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 8.35413109155392, |
|
"grad_norm": 1.3240666389465332, |
|
"learning_rate": 2.6111111111111114e-05, |
|
"loss": 0.7874, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 8.406672796532247, |
|
"grad_norm": 1.077017903327942, |
|
"learning_rate": 2.5925925925925925e-05, |
|
"loss": 0.7835, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 8.406672796532247, |
|
"eval_loss": 0.8272402882575989, |
|
"eval_runtime": 1.2976, |
|
"eval_samples_per_second": 7.706, |
|
"eval_steps_per_second": 3.853, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 8.459214501510575, |
|
"grad_norm": 1.0706350803375244, |
|
"learning_rate": 2.5740740740740743e-05, |
|
"loss": 0.8083, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 8.5117562064889, |
|
"grad_norm": 1.0505809783935547, |
|
"learning_rate": 2.5555555555555554e-05, |
|
"loss": 0.8085, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 8.564297911467227, |
|
"grad_norm": 1.0813027620315552, |
|
"learning_rate": 2.5370370370370372e-05, |
|
"loss": 0.7943, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 8.616839616445553, |
|
"grad_norm": 1.4488904476165771, |
|
"learning_rate": 2.5185185185185183e-05, |
|
"loss": 0.7885, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 8.66938132142388, |
|
"grad_norm": 1.1785513162612915, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.7867, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 8.66938132142388, |
|
"eval_loss": 0.8290830850601196, |
|
"eval_runtime": 1.2989, |
|
"eval_samples_per_second": 7.699, |
|
"eval_steps_per_second": 3.849, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 8.721923026402207, |
|
"grad_norm": 1.331755518913269, |
|
"learning_rate": 2.4814814814814816e-05, |
|
"loss": 0.7887, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 8.774464731380533, |
|
"grad_norm": 1.0079772472381592, |
|
"learning_rate": 2.462962962962963e-05, |
|
"loss": 0.7831, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 8.82700643635886, |
|
"grad_norm": 1.6257929801940918, |
|
"learning_rate": 2.4444444444444445e-05, |
|
"loss": 0.788, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 8.879548141337187, |
|
"grad_norm": 1.5751090049743652, |
|
"learning_rate": 2.425925925925926e-05, |
|
"loss": 0.7864, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 8.932089846315513, |
|
"grad_norm": 1.0488544702529907, |
|
"learning_rate": 2.4074074074074074e-05, |
|
"loss": 0.8083, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 8.932089846315513, |
|
"eval_loss": 0.8188499212265015, |
|
"eval_runtime": 1.2935, |
|
"eval_samples_per_second": 7.731, |
|
"eval_steps_per_second": 3.865, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 8.98463155129384, |
|
"grad_norm": 1.8487260341644287, |
|
"learning_rate": 2.3888888888888892e-05, |
|
"loss": 0.7956, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 9.037173256272165, |
|
"grad_norm": 1.395914912223816, |
|
"learning_rate": 2.3703703703703707e-05, |
|
"loss": 0.7921, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 9.089714961250493, |
|
"grad_norm": 1.341793417930603, |
|
"learning_rate": 2.351851851851852e-05, |
|
"loss": 0.7975, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 9.14225666622882, |
|
"grad_norm": 1.0076344013214111, |
|
"learning_rate": 2.3333333333333336e-05, |
|
"loss": 0.773, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 9.194798371207145, |
|
"grad_norm": 1.081777811050415, |
|
"learning_rate": 2.314814814814815e-05, |
|
"loss": 0.7654, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 9.194798371207145, |
|
"eval_loss": 0.8274938464164734, |
|
"eval_runtime": 1.2836, |
|
"eval_samples_per_second": 7.79, |
|
"eval_steps_per_second": 3.895, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 9.247340076185472, |
|
"grad_norm": 1.1430171728134155, |
|
"learning_rate": 2.2962962962962965e-05, |
|
"loss": 0.7956, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 9.2998817811638, |
|
"grad_norm": 1.0450961589813232, |
|
"learning_rate": 2.277777777777778e-05, |
|
"loss": 0.7774, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 9.352423486142126, |
|
"grad_norm": 1.0086592435836792, |
|
"learning_rate": 2.2592592592592594e-05, |
|
"loss": 0.7739, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 9.404965191120452, |
|
"grad_norm": 1.7970887422561646, |
|
"learning_rate": 2.240740740740741e-05, |
|
"loss": 0.7793, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 9.457506896098778, |
|
"grad_norm": 1.5623388290405273, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.7793, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 9.457506896098778, |
|
"eval_loss": 0.8288339376449585, |
|
"eval_runtime": 1.2843, |
|
"eval_samples_per_second": 7.786, |
|
"eval_steps_per_second": 3.893, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 9.510048601077106, |
|
"grad_norm": 1.1271198987960815, |
|
"learning_rate": 2.2037037037037038e-05, |
|
"loss": 0.7693, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 9.562590306055432, |
|
"grad_norm": 1.0703665018081665, |
|
"learning_rate": 2.1851851851851852e-05, |
|
"loss": 0.7734, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 9.615132011033758, |
|
"grad_norm": 1.5558685064315796, |
|
"learning_rate": 2.1666666666666667e-05, |
|
"loss": 0.7778, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 9.667673716012084, |
|
"grad_norm": 1.932632565498352, |
|
"learning_rate": 2.148148148148148e-05, |
|
"loss": 0.7842, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 9.720215420990412, |
|
"grad_norm": 2.2928659915924072, |
|
"learning_rate": 2.1296296296296296e-05, |
|
"loss": 0.7761, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 9.720215420990412, |
|
"eval_loss": 0.8304858207702637, |
|
"eval_runtime": 1.2431, |
|
"eval_samples_per_second": 8.045, |
|
"eval_steps_per_second": 4.022, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 9.772757125968738, |
|
"grad_norm": 1.588517189025879, |
|
"learning_rate": 2.111111111111111e-05, |
|
"loss": 0.7738, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 9.825298830947064, |
|
"grad_norm": 1.4118093252182007, |
|
"learning_rate": 2.0925925925925925e-05, |
|
"loss": 0.7852, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 9.87784053592539, |
|
"grad_norm": 1.7786134481430054, |
|
"learning_rate": 2.074074074074074e-05, |
|
"loss": 0.789, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 9.930382240903718, |
|
"grad_norm": 1.4427546262741089, |
|
"learning_rate": 2.0555555555555555e-05, |
|
"loss": 0.8028, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 9.982923945882044, |
|
"grad_norm": 1.2574381828308105, |
|
"learning_rate": 2.037037037037037e-05, |
|
"loss": 0.7694, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 9.982923945882044, |
|
"eval_loss": 0.8281936645507812, |
|
"eval_runtime": 1.2455, |
|
"eval_samples_per_second": 8.029, |
|
"eval_steps_per_second": 4.015, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 10.03546565086037, |
|
"grad_norm": 1.5743870735168457, |
|
"learning_rate": 2.0185185185185187e-05, |
|
"loss": 0.7619, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 10.088007355838696, |
|
"grad_norm": 1.350224494934082, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7596, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 10.140549060817024, |
|
"grad_norm": 1.1520729064941406, |
|
"learning_rate": 1.9814814814814816e-05, |
|
"loss": 0.7561, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 10.19309076579535, |
|
"grad_norm": 1.2705488204956055, |
|
"learning_rate": 1.962962962962963e-05, |
|
"loss": 0.7659, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 10.245632470773677, |
|
"grad_norm": 1.1752071380615234, |
|
"learning_rate": 1.9444444444444445e-05, |
|
"loss": 0.7597, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 10.245632470773677, |
|
"eval_loss": 0.8358650207519531, |
|
"eval_runtime": 1.2389, |
|
"eval_samples_per_second": 8.072, |
|
"eval_steps_per_second": 4.036, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 10.298174175752003, |
|
"grad_norm": 1.3173915147781372, |
|
"learning_rate": 1.925925925925926e-05, |
|
"loss": 0.7787, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 10.35071588073033, |
|
"grad_norm": 1.3181281089782715, |
|
"learning_rate": 1.9074074074074075e-05, |
|
"loss": 0.7559, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 10.403257585708657, |
|
"grad_norm": 1.1058646440505981, |
|
"learning_rate": 1.888888888888889e-05, |
|
"loss": 0.7659, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 10.455799290686983, |
|
"grad_norm": 1.2898966073989868, |
|
"learning_rate": 1.8703703703703704e-05, |
|
"loss": 0.7642, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 10.508340995665309, |
|
"grad_norm": 1.7255585193634033, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 0.7659, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 10.508340995665309, |
|
"eval_loss": 0.8337475061416626, |
|
"eval_runtime": 1.2724, |
|
"eval_samples_per_second": 7.859, |
|
"eval_steps_per_second": 3.93, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 10.560882700643635, |
|
"grad_norm": 1.101854681968689, |
|
"learning_rate": 1.8333333333333333e-05, |
|
"loss": 0.7754, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 10.613424405621963, |
|
"grad_norm": 1.6675972938537598, |
|
"learning_rate": 1.814814814814815e-05, |
|
"loss": 0.7597, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 10.665966110600289, |
|
"grad_norm": 1.428888201713562, |
|
"learning_rate": 1.7962962962962965e-05, |
|
"loss": 0.7714, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 10.718507815578615, |
|
"grad_norm": 1.2556451559066772, |
|
"learning_rate": 1.777777777777778e-05, |
|
"loss": 0.7628, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 10.771049520556943, |
|
"grad_norm": 1.817168951034546, |
|
"learning_rate": 1.7592592592592595e-05, |
|
"loss": 0.7652, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 10.771049520556943, |
|
"eval_loss": 0.8289246559143066, |
|
"eval_runtime": 1.2657, |
|
"eval_samples_per_second": 7.901, |
|
"eval_steps_per_second": 3.95, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 10.823591225535269, |
|
"grad_norm": 0.9452287554740906, |
|
"learning_rate": 1.740740740740741e-05, |
|
"loss": 0.7661, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 10.876132930513595, |
|
"grad_norm": 1.0433977842330933, |
|
"learning_rate": 1.7222222222222224e-05, |
|
"loss": 0.7662, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 10.928674635491921, |
|
"grad_norm": 1.7316306829452515, |
|
"learning_rate": 1.7037037037037038e-05, |
|
"loss": 0.7694, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 10.981216340470247, |
|
"grad_norm": 1.1855412721633911, |
|
"learning_rate": 1.6851851851851853e-05, |
|
"loss": 0.7608, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 11.033758045448575, |
|
"grad_norm": 1.2221699953079224, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.7538, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 11.033758045448575, |
|
"eval_loss": 0.8385075330734253, |
|
"eval_runtime": 1.2472, |
|
"eval_samples_per_second": 8.018, |
|
"eval_steps_per_second": 4.009, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 11.086299750426901, |
|
"grad_norm": 1.1642967462539673, |
|
"learning_rate": 1.6481481481481482e-05, |
|
"loss": 0.7348, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 11.138841455405228, |
|
"grad_norm": 1.7363336086273193, |
|
"learning_rate": 1.62962962962963e-05, |
|
"loss": 0.756, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 11.191383160383554, |
|
"grad_norm": 1.3126277923583984, |
|
"learning_rate": 1.6111111111111115e-05, |
|
"loss": 0.7515, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 11.243924865361882, |
|
"grad_norm": 2.1372787952423096, |
|
"learning_rate": 1.5925925925925926e-05, |
|
"loss": 0.7487, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 11.296466570340208, |
|
"grad_norm": 1.5635650157928467, |
|
"learning_rate": 1.574074074074074e-05, |
|
"loss": 0.7554, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 11.296466570340208, |
|
"eval_loss": 0.8373192548751831, |
|
"eval_runtime": 1.2681, |
|
"eval_samples_per_second": 7.886, |
|
"eval_steps_per_second": 3.943, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 11.349008275318534, |
|
"grad_norm": 1.708914875984192, |
|
"learning_rate": 1.5555555555555555e-05, |
|
"loss": 0.7648, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 11.40154998029686, |
|
"grad_norm": 1.1601842641830444, |
|
"learning_rate": 1.537037037037037e-05, |
|
"loss": 0.7614, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 11.454091685275188, |
|
"grad_norm": 1.1674526929855347, |
|
"learning_rate": 1.5185185185185186e-05, |
|
"loss": 0.7653, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 11.506633390253514, |
|
"grad_norm": 1.2474111318588257, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.7596, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 11.55917509523184, |
|
"grad_norm": 1.2556248903274536, |
|
"learning_rate": 1.4814814814814815e-05, |
|
"loss": 0.7445, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 11.55917509523184, |
|
"eval_loss": 0.8322497606277466, |
|
"eval_runtime": 1.2428, |
|
"eval_samples_per_second": 8.046, |
|
"eval_steps_per_second": 4.023, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 11.611716800210166, |
|
"grad_norm": 1.5506947040557861, |
|
"learning_rate": 1.462962962962963e-05, |
|
"loss": 0.7449, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 11.664258505188494, |
|
"grad_norm": 1.124089241027832, |
|
"learning_rate": 1.4444444444444444e-05, |
|
"loss": 0.7613, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 11.71680021016682, |
|
"grad_norm": 1.5663940906524658, |
|
"learning_rate": 1.425925925925926e-05, |
|
"loss": 0.7447, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 11.769341915145146, |
|
"grad_norm": 1.4085638523101807, |
|
"learning_rate": 1.4074074074074075e-05, |
|
"loss": 0.745, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 11.821883620123472, |
|
"grad_norm": 1.8693653345108032, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 0.7615, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 11.821883620123472, |
|
"eval_loss": 0.8351184129714966, |
|
"eval_runtime": 1.2433, |
|
"eval_samples_per_second": 8.043, |
|
"eval_steps_per_second": 4.022, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 11.8744253251018, |
|
"grad_norm": 1.807320237159729, |
|
"learning_rate": 1.3703703703703704e-05, |
|
"loss": 0.7605, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 11.926967030080126, |
|
"grad_norm": 1.1300692558288574, |
|
"learning_rate": 1.3518518518518519e-05, |
|
"loss": 0.7604, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 11.979508735058452, |
|
"grad_norm": 1.5714143514633179, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.7462, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 12.032050440036778, |
|
"grad_norm": 1.1730937957763672, |
|
"learning_rate": 1.3148148148148148e-05, |
|
"loss": 0.7655, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 12.084592145015106, |
|
"grad_norm": 1.1946797370910645, |
|
"learning_rate": 1.2962962962962962e-05, |
|
"loss": 0.7326, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 12.084592145015106, |
|
"eval_loss": 0.8375317454338074, |
|
"eval_runtime": 1.2867, |
|
"eval_samples_per_second": 7.772, |
|
"eval_steps_per_second": 3.886, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 12.137133849993432, |
|
"grad_norm": 1.1328163146972656, |
|
"learning_rate": 1.2777777777777777e-05, |
|
"loss": 0.7439, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 12.189675554971759, |
|
"grad_norm": 1.026856780052185, |
|
"learning_rate": 1.2592592592592592e-05, |
|
"loss": 0.7589, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 12.242217259950085, |
|
"grad_norm": 1.920116662979126, |
|
"learning_rate": 1.2407407407407408e-05, |
|
"loss": 0.7458, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 12.294758964928413, |
|
"grad_norm": 1.3238016366958618, |
|
"learning_rate": 1.2222222222222222e-05, |
|
"loss": 0.74, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 12.347300669906739, |
|
"grad_norm": 1.192124843597412, |
|
"learning_rate": 1.2037037037037037e-05, |
|
"loss": 0.7369, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 12.347300669906739, |
|
"eval_loss": 0.8405577540397644, |
|
"eval_runtime": 1.2401, |
|
"eval_samples_per_second": 8.064, |
|
"eval_steps_per_second": 4.032, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 12.399842374885065, |
|
"grad_norm": 1.3931485414505005, |
|
"learning_rate": 1.1851851851851853e-05, |
|
"loss": 0.7619, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 12.452384079863391, |
|
"grad_norm": 1.071673035621643, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 0.753, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 12.504925784841719, |
|
"grad_norm": 1.2407554388046265, |
|
"learning_rate": 1.1481481481481482e-05, |
|
"loss": 0.7382, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 12.557467489820045, |
|
"grad_norm": 1.2360953092575073, |
|
"learning_rate": 1.1296296296296297e-05, |
|
"loss": 0.7348, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 12.610009194798371, |
|
"grad_norm": 1.2687872648239136, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 0.75, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 12.610009194798371, |
|
"eval_loss": 0.8404332399368286, |
|
"eval_runtime": 1.2513, |
|
"eval_samples_per_second": 7.992, |
|
"eval_steps_per_second": 3.996, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 12.662550899776697, |
|
"grad_norm": 1.1866815090179443, |
|
"learning_rate": 1.0925925925925926e-05, |
|
"loss": 0.7419, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 12.715092604755025, |
|
"grad_norm": 1.715956687927246, |
|
"learning_rate": 1.074074074074074e-05, |
|
"loss": 0.7406, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 12.767634309733351, |
|
"grad_norm": 1.1529712677001953, |
|
"learning_rate": 1.0555555555555555e-05, |
|
"loss": 0.7516, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 12.820176014711677, |
|
"grad_norm": 1.2465269565582275, |
|
"learning_rate": 1.037037037037037e-05, |
|
"loss": 0.7407, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 12.872717719690003, |
|
"grad_norm": 1.1018378734588623, |
|
"learning_rate": 1.0185185185185185e-05, |
|
"loss": 0.7514, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 12.872717719690003, |
|
"eval_loss": 0.8341609835624695, |
|
"eval_runtime": 1.2427, |
|
"eval_samples_per_second": 8.047, |
|
"eval_steps_per_second": 4.024, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 12.925259424668331, |
|
"grad_norm": 1.2097246646881104, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7298, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 12.977801129646657, |
|
"grad_norm": 1.5760364532470703, |
|
"learning_rate": 9.814814814814815e-06, |
|
"loss": 0.7538, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 13.030342834624983, |
|
"grad_norm": 1.2670038938522339, |
|
"learning_rate": 9.62962962962963e-06, |
|
"loss": 0.7437, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 13.08288453960331, |
|
"grad_norm": 1.178148627281189, |
|
"learning_rate": 9.444444444444445e-06, |
|
"loss": 0.7392, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 13.135426244581637, |
|
"grad_norm": 1.2803364992141724, |
|
"learning_rate": 9.259259259259259e-06, |
|
"loss": 0.7178, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 13.135426244581637, |
|
"eval_loss": 0.8406384587287903, |
|
"eval_runtime": 1.2436, |
|
"eval_samples_per_second": 8.041, |
|
"eval_steps_per_second": 4.02, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 13.187967949559964, |
|
"grad_norm": 1.3507287502288818, |
|
"learning_rate": 9.074074074074075e-06, |
|
"loss": 0.7473, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 13.24050965453829, |
|
"grad_norm": 1.1901633739471436, |
|
"learning_rate": 8.88888888888889e-06, |
|
"loss": 0.7386, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 13.293051359516616, |
|
"grad_norm": 1.25368332862854, |
|
"learning_rate": 8.703703703703705e-06, |
|
"loss": 0.7426, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 13.345593064494944, |
|
"grad_norm": 1.4581819772720337, |
|
"learning_rate": 8.518518518518519e-06, |
|
"loss": 0.7479, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 13.39813476947327, |
|
"grad_norm": 1.201744556427002, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.7266, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 13.39813476947327, |
|
"eval_loss": 0.8411173820495605, |
|
"eval_runtime": 1.2635, |
|
"eval_samples_per_second": 7.915, |
|
"eval_steps_per_second": 3.957, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 13.450676474451596, |
|
"grad_norm": 1.3724862337112427, |
|
"learning_rate": 8.14814814814815e-06, |
|
"loss": 0.7392, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 13.503218179429922, |
|
"grad_norm": 1.2667688131332397, |
|
"learning_rate": 7.962962962962963e-06, |
|
"loss": 0.731, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 13.55575988440825, |
|
"grad_norm": 1.3549320697784424, |
|
"learning_rate": 7.777777777777777e-06, |
|
"loss": 0.7307, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 13.608301589386576, |
|
"grad_norm": 1.3319281339645386, |
|
"learning_rate": 7.592592592592593e-06, |
|
"loss": 0.7315, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 13.660843294364902, |
|
"grad_norm": 1.3759852647781372, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 0.7338, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 13.660843294364902, |
|
"eval_loss": 0.8397306203842163, |
|
"eval_runtime": 1.2452, |
|
"eval_samples_per_second": 8.031, |
|
"eval_steps_per_second": 4.015, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 13.713384999343228, |
|
"grad_norm": 1.452658772468567, |
|
"learning_rate": 7.222222222222222e-06, |
|
"loss": 0.7339, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 13.765926704321554, |
|
"grad_norm": 1.1617493629455566, |
|
"learning_rate": 7.0370370370370375e-06, |
|
"loss": 0.737, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 13.818468409299882, |
|
"grad_norm": 1.628431797027588, |
|
"learning_rate": 6.851851851851852e-06, |
|
"loss": 0.7437, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 13.871010114278208, |
|
"grad_norm": 1.4544808864593506, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.7411, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 13.923551819256534, |
|
"grad_norm": 1.7027857303619385, |
|
"learning_rate": 6.481481481481481e-06, |
|
"loss": 0.7432, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 13.923551819256534, |
|
"eval_loss": 0.8381926417350769, |
|
"eval_runtime": 1.2458, |
|
"eval_samples_per_second": 8.027, |
|
"eval_steps_per_second": 4.013, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 13.976093524234862, |
|
"grad_norm": 1.320162296295166, |
|
"learning_rate": 6.296296296296296e-06, |
|
"loss": 0.7463, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 14.028635229213188, |
|
"grad_norm": 1.2044928073883057, |
|
"learning_rate": 6.111111111111111e-06, |
|
"loss": 0.7358, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 14.081176934191515, |
|
"grad_norm": 1.0798685550689697, |
|
"learning_rate": 5.925925925925927e-06, |
|
"loss": 0.7416, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 14.13371863916984, |
|
"grad_norm": 1.4219785928726196, |
|
"learning_rate": 5.740740740740741e-06, |
|
"loss": 0.7256, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 14.186260344148167, |
|
"grad_norm": 1.1347541809082031, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 0.7322, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 14.186260344148167, |
|
"eval_loss": 0.8432482481002808, |
|
"eval_runtime": 1.2881, |
|
"eval_samples_per_second": 7.763, |
|
"eval_steps_per_second": 3.882, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 14.238802049126495, |
|
"grad_norm": 1.5265473127365112, |
|
"learning_rate": 5.37037037037037e-06, |
|
"loss": 0.7281, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 14.29134375410482, |
|
"grad_norm": 1.0720326900482178, |
|
"learning_rate": 5.185185185185185e-06, |
|
"loss": 0.731, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 14.343885459083147, |
|
"grad_norm": 1.5150460004806519, |
|
"learning_rate": 5e-06, |
|
"loss": 0.719, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 14.396427164061473, |
|
"grad_norm": 1.2847768068313599, |
|
"learning_rate": 4.814814814814815e-06, |
|
"loss": 0.7307, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 14.4489688690398, |
|
"grad_norm": 1.4064009189605713, |
|
"learning_rate": 4.6296296296296296e-06, |
|
"loss": 0.7293, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 14.4489688690398, |
|
"eval_loss": 0.8425260782241821, |
|
"eval_runtime": 1.247, |
|
"eval_samples_per_second": 8.019, |
|
"eval_steps_per_second": 4.01, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 14.501510574018127, |
|
"grad_norm": 1.1766741275787354, |
|
"learning_rate": 4.444444444444445e-06, |
|
"loss": 0.7406, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 14.554052278996453, |
|
"grad_norm": 1.3252798318862915, |
|
"learning_rate": 4.2592592592592596e-06, |
|
"loss": 0.7377, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 14.60659398397478, |
|
"grad_norm": 1.2687978744506836, |
|
"learning_rate": 4.074074074074075e-06, |
|
"loss": 0.7303, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 14.659135688953107, |
|
"grad_norm": 1.29690420627594, |
|
"learning_rate": 3.888888888888889e-06, |
|
"loss": 0.7349, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 14.711677393931433, |
|
"grad_norm": 1.4593523740768433, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 0.7327, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 14.711677393931433, |
|
"eval_loss": 0.842442512512207, |
|
"eval_runtime": 1.2608, |
|
"eval_samples_per_second": 7.931, |
|
"eval_steps_per_second": 3.966, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 14.76421909890976, |
|
"grad_norm": 1.1667600870132446, |
|
"learning_rate": 3.5185185185185187e-06, |
|
"loss": 0.7276, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 14.816760803888085, |
|
"grad_norm": 1.0612125396728516, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.7468, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 14.869302508866413, |
|
"grad_norm": 1.117440938949585, |
|
"learning_rate": 3.148148148148148e-06, |
|
"loss": 0.7189, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 14.92184421384474, |
|
"grad_norm": 1.1088405847549438, |
|
"learning_rate": 2.9629629629629633e-06, |
|
"loss": 0.732, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 14.974385918823065, |
|
"grad_norm": 1.381748914718628, |
|
"learning_rate": 2.777777777777778e-06, |
|
"loss": 0.7368, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 14.974385918823065, |
|
"eval_loss": 0.844028651714325, |
|
"eval_runtime": 1.2451, |
|
"eval_samples_per_second": 8.031, |
|
"eval_steps_per_second": 4.016, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 15.026927623801392, |
|
"grad_norm": 1.2738327980041504, |
|
"learning_rate": 2.5925925925925925e-06, |
|
"loss": 0.7411, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 15.07946932877972, |
|
"grad_norm": 1.450596809387207, |
|
"learning_rate": 2.4074074074074075e-06, |
|
"loss": 0.721, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 15.132011033758046, |
|
"grad_norm": 1.2589495182037354, |
|
"learning_rate": 2.2222222222222225e-06, |
|
"loss": 0.7134, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 15.184552738736372, |
|
"grad_norm": 1.1227418184280396, |
|
"learning_rate": 2.0370370370370375e-06, |
|
"loss": 0.7361, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 15.237094443714698, |
|
"grad_norm": 1.1688610315322876, |
|
"learning_rate": 1.8518518518518519e-06, |
|
"loss": 0.722, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 15.237094443714698, |
|
"eval_loss": 0.8448740243911743, |
|
"eval_runtime": 1.2712, |
|
"eval_samples_per_second": 7.867, |
|
"eval_steps_per_second": 3.933, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 15.289636148693026, |
|
"grad_norm": 1.4582873582839966, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.7236, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 15.342177853671352, |
|
"grad_norm": 1.5905903577804565, |
|
"learning_rate": 1.4814814814814817e-06, |
|
"loss": 0.7332, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 15.394719558649678, |
|
"grad_norm": 2.0454487800598145, |
|
"learning_rate": 1.2962962962962962e-06, |
|
"loss": 0.7222, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 15.447261263628004, |
|
"grad_norm": 1.3177134990692139, |
|
"learning_rate": 1.1111111111111112e-06, |
|
"loss": 0.7362, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 15.499802968606332, |
|
"grad_norm": 1.0010744333267212, |
|
"learning_rate": 9.259259259259259e-07, |
|
"loss": 0.7304, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 15.499802968606332, |
|
"eval_loss": 0.84196537733078, |
|
"eval_runtime": 1.2445, |
|
"eval_samples_per_second": 8.035, |
|
"eval_steps_per_second": 4.018, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 15.552344673584658, |
|
"grad_norm": 1.296751618385315, |
|
"learning_rate": 7.407407407407408e-07, |
|
"loss": 0.719, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 15.604886378562984, |
|
"grad_norm": 1.198317050933838, |
|
"learning_rate": 5.555555555555556e-07, |
|
"loss": 0.7359, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 15.65742808354131, |
|
"grad_norm": 1.3005775213241577, |
|
"learning_rate": 3.703703703703704e-07, |
|
"loss": 0.7268, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 15.709969788519638, |
|
"grad_norm": 1.1755659580230713, |
|
"learning_rate": 1.851851851851852e-07, |
|
"loss": 0.7266, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 15.762511493497964, |
|
"grad_norm": 1.1558780670166016, |
|
"learning_rate": 0.0, |
|
"loss": 0.7389, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 15.762511493497964, |
|
"eval_loss": 0.8437487483024597, |
|
"eval_runtime": 1.263, |
|
"eval_samples_per_second": 7.918, |
|
"eval_steps_per_second": 3.959, |
|
"step": 30000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 30000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 16, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.658737511387988e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|