{ "best_metric": null, "best_model_checkpoint": null, "epoch": 38.0, "eval_steps": 500, "global_step": 20102, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1890359168241966, "grad_norm": 0.5107442140579224, "learning_rate": 5e-05, "loss": 2.0482, "step": 100 }, { "epoch": 0.3780718336483932, "grad_norm": 0.5563424229621887, "learning_rate": 5e-05, "loss": 1.8835, "step": 200 }, { "epoch": 0.5671077504725898, "grad_norm": 0.4735371172428131, "learning_rate": 5e-05, "loss": 1.8512, "step": 300 }, { "epoch": 0.7561436672967864, "grad_norm": 0.4543495178222656, "learning_rate": 5e-05, "loss": 1.8489, "step": 400 }, { "epoch": 0.945179584120983, "grad_norm": 0.5007112622261047, "learning_rate": 5e-05, "loss": 1.8439, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.5733333333333334, "eval_loss": 1.6194567680358887, "eval_runtime": 5.4154, "eval_samples_per_second": 92.33, "eval_steps_per_second": 11.634, "step": 529 }, { "epoch": 1.0, "eval_exact_match": 17.0, "eval_f1": 26.880952380952387, "step": 529 }, { "epoch": 1.1342155009451795, "grad_norm": 0.5181360244750977, "learning_rate": 5e-05, "loss": 1.8217, "step": 600 }, { "epoch": 1.3232514177693762, "grad_norm": 1.1348918676376343, "learning_rate": 5e-05, "loss": 1.7866, "step": 700 }, { "epoch": 1.5122873345935728, "grad_norm": 0.661758303642273, "learning_rate": 5e-05, "loss": 1.7934, "step": 800 }, { "epoch": 1.7013232514177694, "grad_norm": 0.5634772777557373, "learning_rate": 5e-05, "loss": 1.7731, "step": 900 }, { "epoch": 1.8903591682419658, "grad_norm": 0.6698930263519287, "learning_rate": 5e-05, "loss": 1.7969, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.5751794871794872, "eval_loss": 1.603736162185669, "eval_runtime": 5.0986, "eval_samples_per_second": 98.066, "eval_steps_per_second": 12.356, "step": 1058 }, { "epoch": 2.0, "eval_exact_match": 17.6, "eval_f1": 27.218095238095238, "step": 1058 }, { "epoch": 2.0793950850661624, "grad_norm": 0.6265193223953247, "learning_rate": 5e-05, "loss": 1.7416, "step": 1100 }, { "epoch": 2.268431001890359, "grad_norm": 0.812364935874939, "learning_rate": 5e-05, "loss": 1.6957, "step": 1200 }, { "epoch": 2.4574669187145557, "grad_norm": 1.0972926616668701, "learning_rate": 5e-05, "loss": 1.7033, "step": 1300 }, { "epoch": 2.6465028355387523, "grad_norm": 1.0831658840179443, "learning_rate": 5e-05, "loss": 1.6935, "step": 1400 }, { "epoch": 2.835538752362949, "grad_norm": 1.0022494792938232, "learning_rate": 5e-05, "loss": 1.6799, "step": 1500 }, { "epoch": 3.0, "eval_accuracy": 0.573025641025641, "eval_loss": 1.6428505182266235, "eval_runtime": 5.1488, "eval_samples_per_second": 97.111, "eval_steps_per_second": 12.236, "step": 1587 }, { "epoch": 3.0, "eval_exact_match": 20.2, "eval_f1": 27.93809523809523, "step": 1587 }, { "epoch": 3.0245746691871456, "grad_norm": 0.8240587711334229, "learning_rate": 5e-05, "loss": 1.6618, "step": 1600 }, { "epoch": 3.213610586011342, "grad_norm": 1.2994645833969116, "learning_rate": 5e-05, "loss": 1.5736, "step": 1700 }, { "epoch": 3.402646502835539, "grad_norm": 1.0891159772872925, "learning_rate": 5e-05, "loss": 1.5798, "step": 1800 }, { "epoch": 3.5916824196597354, "grad_norm": 1.3826475143432617, "learning_rate": 5e-05, "loss": 1.5957, "step": 1900 }, { "epoch": 3.780718336483932, "grad_norm": 0.9901586174964905, "learning_rate": 5e-05, "loss": 1.6026, "step": 2000 }, { "epoch": 3.9697542533081287, "grad_norm": 1.0607813596725464, "learning_rate": 5e-05, "loss": 1.6106, "step": 2100 }, { "epoch": 4.0, "eval_accuracy": 0.5708717948717948, "eval_loss": 1.7143596410751343, "eval_runtime": 5.3942, "eval_samples_per_second": 92.693, "eval_steps_per_second": 11.679, "step": 2116 }, { "epoch": 4.0, "eval_exact_match": 19.0, "eval_f1": 27.71428571428571, "step": 2116 }, { "epoch": 4.158790170132325, "grad_norm": 1.3189178705215454, "learning_rate": 5e-05, "loss": 1.5029, "step": 2200 }, { "epoch": 4.3478260869565215, "grad_norm": 1.062267541885376, "learning_rate": 5e-05, "loss": 1.51, "step": 2300 }, { "epoch": 4.536862003780718, "grad_norm": 1.3810843229293823, "learning_rate": 5e-05, "loss": 1.4867, "step": 2400 }, { "epoch": 4.725897920604915, "grad_norm": 1.1991653442382812, "learning_rate": 5e-05, "loss": 1.501, "step": 2500 }, { "epoch": 4.914933837429111, "grad_norm": 1.310654878616333, "learning_rate": 5e-05, "loss": 1.5034, "step": 2600 }, { "epoch": 5.0, "eval_accuracy": 0.5681025641025641, "eval_loss": 1.8328346014022827, "eval_runtime": 5.7763, "eval_samples_per_second": 86.561, "eval_steps_per_second": 10.907, "step": 2645 }, { "epoch": 5.0, "eval_exact_match": 18.8, "eval_f1": 26.6015873015873, "step": 2645 }, { "epoch": 5.103969754253308, "grad_norm": 1.1406631469726562, "learning_rate": 5e-05, "loss": 1.4605, "step": 2700 }, { "epoch": 5.293005671077505, "grad_norm": 1.447221279144287, "learning_rate": 5e-05, "loss": 1.4204, "step": 2800 }, { "epoch": 5.482041587901701, "grad_norm": 1.461799144744873, "learning_rate": 5e-05, "loss": 1.4233, "step": 2900 }, { "epoch": 5.671077504725898, "grad_norm": 1.4184092283248901, "learning_rate": 5e-05, "loss": 1.4265, "step": 3000 }, { "epoch": 5.8601134215500945, "grad_norm": 0.9850455522537231, "learning_rate": 5e-05, "loss": 1.4139, "step": 3100 }, { "epoch": 6.0, "eval_accuracy": 0.564974358974359, "eval_loss": 1.931396484375, "eval_runtime": 5.3358, "eval_samples_per_second": 93.706, "eval_steps_per_second": 11.807, "step": 3174 }, { "epoch": 6.0, "eval_exact_match": 17.6, "eval_f1": 25.464285714285715, "step": 3174 }, { "epoch": 6.049149338374291, "grad_norm": 1.2274116277694702, "learning_rate": 5e-05, "loss": 1.4159, "step": 3200 }, { "epoch": 6.238185255198488, "grad_norm": 1.2107988595962524, "learning_rate": 5e-05, "loss": 1.3579, "step": 3300 }, { "epoch": 6.427221172022684, "grad_norm": 1.2965952157974243, "learning_rate": 5e-05, "loss": 1.3485, "step": 3400 }, { "epoch": 6.616257088846881, "grad_norm": 1.3035151958465576, "learning_rate": 5e-05, "loss": 1.36, "step": 3500 }, { "epoch": 6.805293005671078, "grad_norm": 1.1840296983718872, "learning_rate": 5e-05, "loss": 1.3599, "step": 3600 }, { "epoch": 6.994328922495274, "grad_norm": 1.780287265777588, "learning_rate": 5e-05, "loss": 1.355, "step": 3700 }, { "epoch": 7.0, "eval_accuracy": 0.565076923076923, "eval_loss": 1.9659509658813477, "eval_runtime": 5.3127, "eval_samples_per_second": 94.114, "eval_steps_per_second": 11.858, "step": 3703 }, { "epoch": 7.0, "eval_exact_match": 18.0, "eval_f1": 26.546666666666663, "step": 3703 }, { "epoch": 7.183364839319471, "grad_norm": 1.1608421802520752, "learning_rate": 5e-05, "loss": 1.298, "step": 3800 }, { "epoch": 7.3724007561436675, "grad_norm": 1.083270788192749, "learning_rate": 5e-05, "loss": 1.3088, "step": 3900 }, { "epoch": 7.561436672967864, "grad_norm": 1.3489899635314941, "learning_rate": 5e-05, "loss": 1.299, "step": 4000 }, { "epoch": 7.750472589792061, "grad_norm": 1.1663447618484497, "learning_rate": 5e-05, "loss": 1.2841, "step": 4100 }, { "epoch": 7.939508506616257, "grad_norm": 1.9308583736419678, "learning_rate": 5e-05, "loss": 1.314, "step": 4200 }, { "epoch": 8.0, "eval_accuracy": 0.5616923076923077, "eval_loss": 2.1164443492889404, "eval_runtime": 5.2165, "eval_samples_per_second": 95.85, "eval_steps_per_second": 12.077, "step": 4232 }, { "epoch": 8.0, "eval_exact_match": 17.0, "eval_f1": 24.659841269841277, "step": 4232 }, { "epoch": 8.128544423440454, "grad_norm": 1.109632134437561, "learning_rate": 5e-05, "loss": 1.2558, "step": 4300 }, { "epoch": 8.31758034026465, "grad_norm": 1.1132491827011108, "learning_rate": 5e-05, "loss": 1.2264, "step": 4400 }, { "epoch": 8.506616257088847, "grad_norm": 1.1017369031906128, "learning_rate": 5e-05, "loss": 1.2649, "step": 4500 }, { "epoch": 8.695652173913043, "grad_norm": 1.0874963998794556, "learning_rate": 5e-05, "loss": 1.2512, "step": 4600 }, { "epoch": 8.88468809073724, "grad_norm": 1.2617064714431763, "learning_rate": 5e-05, "loss": 1.261, "step": 4700 }, { "epoch": 9.0, "eval_accuracy": 0.5622051282051282, "eval_loss": 2.1635327339172363, "eval_runtime": 5.0832, "eval_samples_per_second": 98.363, "eval_steps_per_second": 12.394, "step": 4761 }, { "epoch": 9.0, "eval_exact_match": 17.8, "eval_f1": 25.451428571428576, "step": 4761 }, { "epoch": 9.073724007561436, "grad_norm": 1.2258027791976929, "learning_rate": 5e-05, "loss": 1.2298, "step": 4800 }, { "epoch": 9.262759924385634, "grad_norm": 1.8261148929595947, "learning_rate": 5e-05, "loss": 1.1748, "step": 4900 }, { "epoch": 9.45179584120983, "grad_norm": 1.1221072673797607, "learning_rate": 5e-05, "loss": 1.19, "step": 5000 }, { "epoch": 9.640831758034027, "grad_norm": 1.3320170640945435, "learning_rate": 5e-05, "loss": 1.1932, "step": 5100 }, { "epoch": 9.829867674858223, "grad_norm": 1.194077730178833, "learning_rate": 5e-05, "loss": 1.217, "step": 5200 }, { "epoch": 10.0, "eval_accuracy": 0.5612820512820513, "eval_loss": 2.244041919708252, "eval_runtime": 5.1606, "eval_samples_per_second": 96.889, "eval_steps_per_second": 12.208, "step": 5290 }, { "epoch": 10.0, "eval_exact_match": 18.4, "eval_f1": 26.067662337662348, "step": 5290 }, { "epoch": 10.01890359168242, "grad_norm": 1.1352814435958862, "learning_rate": 5e-05, "loss": 1.2093, "step": 5300 }, { "epoch": 10.207939508506616, "grad_norm": 1.2741838693618774, "learning_rate": 5e-05, "loss": 1.1467, "step": 5400 }, { "epoch": 10.396975425330814, "grad_norm": 1.6901249885559082, "learning_rate": 5e-05, "loss": 1.1288, "step": 5500 }, { "epoch": 10.58601134215501, "grad_norm": 1.1944465637207031, "learning_rate": 5e-05, "loss": 1.1399, "step": 5600 }, { "epoch": 10.775047258979207, "grad_norm": 1.2234572172164917, "learning_rate": 5e-05, "loss": 1.1678, "step": 5700 }, { "epoch": 10.964083175803403, "grad_norm": 1.2781612873077393, "learning_rate": 5e-05, "loss": 1.146, "step": 5800 }, { "epoch": 11.0, "eval_accuracy": 0.5618461538461539, "eval_loss": 2.296630859375, "eval_runtime": 5.1078, "eval_samples_per_second": 97.889, "eval_steps_per_second": 12.334, "step": 5819 }, { "epoch": 11.0, "eval_exact_match": 16.8, "eval_f1": 23.63305916305917, "step": 5819 }, { "epoch": 11.1531190926276, "grad_norm": 1.138975739479065, "learning_rate": 5e-05, "loss": 1.0842, "step": 5900 }, { "epoch": 11.342155009451796, "grad_norm": 1.727989912033081, "learning_rate": 5e-05, "loss": 1.0806, "step": 6000 }, { "epoch": 11.531190926275993, "grad_norm": 1.3104358911514282, "learning_rate": 5e-05, "loss": 1.0953, "step": 6100 }, { "epoch": 11.720226843100189, "grad_norm": 1.3629850149154663, "learning_rate": 5e-05, "loss": 1.0945, "step": 6200 }, { "epoch": 11.909262759924385, "grad_norm": 1.3276056051254272, "learning_rate": 5e-05, "loss": 1.1069, "step": 6300 }, { "epoch": 12.0, "eval_accuracy": 0.562, "eval_loss": 2.3118155002593994, "eval_runtime": 5.3837, "eval_samples_per_second": 92.873, "eval_steps_per_second": 11.702, "step": 6348 }, { "epoch": 12.0, "eval_exact_match": 17.4, "eval_f1": 24.43924963924964, "step": 6348 }, { "epoch": 12.098298676748582, "grad_norm": 1.2608078718185425, "learning_rate": 5e-05, "loss": 1.0418, "step": 6400 }, { "epoch": 12.287334593572778, "grad_norm": 1.4456945657730103, "learning_rate": 5e-05, "loss": 1.0258, "step": 6500 }, { "epoch": 12.476370510396976, "grad_norm": 1.4195287227630615, "learning_rate": 5e-05, "loss": 1.0255, "step": 6600 }, { "epoch": 12.665406427221171, "grad_norm": 1.2595598697662354, "learning_rate": 5e-05, "loss": 1.0511, "step": 6700 }, { "epoch": 12.854442344045369, "grad_norm": 1.5901292562484741, "learning_rate": 5e-05, "loss": 1.045, "step": 6800 }, { "epoch": 13.0, "eval_accuracy": 0.5614871794871795, "eval_loss": 2.3580987453460693, "eval_runtime": 5.3496, "eval_samples_per_second": 93.466, "eval_steps_per_second": 11.777, "step": 6877 }, { "epoch": 13.0, "eval_exact_match": 17.2, "eval_f1": 24.350995670995676, "step": 6877 }, { "epoch": 13.043478260869565, "grad_norm": 1.471511721611023, "learning_rate": 5e-05, "loss": 1.0404, "step": 6900 }, { "epoch": 13.232514177693762, "grad_norm": 1.2928757667541504, "learning_rate": 5e-05, "loss": 0.9573, "step": 7000 }, { "epoch": 13.421550094517958, "grad_norm": 1.5545622110366821, "learning_rate": 5e-05, "loss": 0.9583, "step": 7100 }, { "epoch": 13.610586011342155, "grad_norm": 1.6132512092590332, "learning_rate": 5e-05, "loss": 0.981, "step": 7200 }, { "epoch": 13.799621928166351, "grad_norm": 1.7548742294311523, "learning_rate": 5e-05, "loss": 0.9999, "step": 7300 }, { "epoch": 13.988657844990549, "grad_norm": 1.4219907522201538, "learning_rate": 5e-05, "loss": 1.0028, "step": 7400 }, { "epoch": 14.0, "eval_accuracy": 0.5618974358974359, "eval_loss": 2.413728952407837, "eval_runtime": 5.0464, "eval_samples_per_second": 99.08, "eval_steps_per_second": 12.484, "step": 7406 }, { "epoch": 14.0, "eval_exact_match": 16.6, "eval_f1": 24.201789321789327, "step": 7406 }, { "epoch": 14.177693761814744, "grad_norm": 1.5799965858459473, "learning_rate": 5e-05, "loss": 0.9096, "step": 7500 }, { "epoch": 14.366729678638942, "grad_norm": 1.5961594581604004, "learning_rate": 5e-05, "loss": 0.9075, "step": 7600 }, { "epoch": 14.555765595463138, "grad_norm": 1.5756512880325317, "learning_rate": 5e-05, "loss": 0.9222, "step": 7700 }, { "epoch": 14.744801512287335, "grad_norm": 1.515251636505127, "learning_rate": 5e-05, "loss": 0.9445, "step": 7800 }, { "epoch": 14.93383742911153, "grad_norm": 2.6804299354553223, "learning_rate": 5e-05, "loss": 0.9153, "step": 7900 }, { "epoch": 15.0, "eval_accuracy": 0.5596923076923077, "eval_loss": 2.4756176471710205, "eval_runtime": 5.1956, "eval_samples_per_second": 96.236, "eval_steps_per_second": 12.126, "step": 7935 }, { "epoch": 15.0, "eval_exact_match": 16.8, "eval_f1": 24.146709956709966, "step": 7935 }, { "epoch": 15.122873345935728, "grad_norm": 1.7786214351654053, "learning_rate": 5e-05, "loss": 0.8874, "step": 8000 }, { "epoch": 15.311909262759924, "grad_norm": 1.709060788154602, "learning_rate": 5e-05, "loss": 0.8538, "step": 8100 }, { "epoch": 15.500945179584122, "grad_norm": 2.3245434761047363, "learning_rate": 5e-05, "loss": 0.8591, "step": 8200 }, { "epoch": 15.689981096408317, "grad_norm": 1.6101887226104736, "learning_rate": 5e-05, "loss": 0.8593, "step": 8300 }, { "epoch": 15.879017013232515, "grad_norm": 1.5917792320251465, "learning_rate": 5e-05, "loss": 0.8748, "step": 8400 }, { "epoch": 16.0, "eval_accuracy": 0.5604615384615385, "eval_loss": 2.4434447288513184, "eval_runtime": 5.1072, "eval_samples_per_second": 97.901, "eval_steps_per_second": 12.336, "step": 8464 }, { "epoch": 16.0, "eval_exact_match": 15.6, "eval_f1": 22.917027417027416, "step": 8464 }, { "epoch": 16.068052930056712, "grad_norm": 1.6529980897903442, "learning_rate": 5e-05, "loss": 0.8413, "step": 8500 }, { "epoch": 16.257088846880908, "grad_norm": 1.931520938873291, "learning_rate": 5e-05, "loss": 0.7863, "step": 8600 }, { "epoch": 16.446124763705104, "grad_norm": 1.8401894569396973, "learning_rate": 5e-05, "loss": 0.799, "step": 8700 }, { "epoch": 16.6351606805293, "grad_norm": 1.7952015399932861, "learning_rate": 5e-05, "loss": 0.8238, "step": 8800 }, { "epoch": 16.8241965973535, "grad_norm": 1.8028634786605835, "learning_rate": 5e-05, "loss": 0.807, "step": 8900 }, { "epoch": 17.0, "eval_accuracy": 0.5606153846153846, "eval_loss": 2.4520959854125977, "eval_runtime": 5.1769, "eval_samples_per_second": 96.584, "eval_steps_per_second": 12.17, "step": 8993 }, { "epoch": 17.0, "eval_exact_match": 17.4, "eval_f1": 25.104011544011545, "step": 8993 }, { "epoch": 17.013232514177695, "grad_norm": 1.7197688817977905, "learning_rate": 5e-05, "loss": 0.8249, "step": 9000 }, { "epoch": 17.20226843100189, "grad_norm": 1.768463134765625, "learning_rate": 5e-05, "loss": 0.7304, "step": 9100 }, { "epoch": 17.391304347826086, "grad_norm": 1.9696024656295776, "learning_rate": 5e-05, "loss": 0.7445, "step": 9200 }, { "epoch": 17.58034026465028, "grad_norm": 2.093703269958496, "learning_rate": 5e-05, "loss": 0.755, "step": 9300 }, { "epoch": 17.76937618147448, "grad_norm": 1.909550666809082, "learning_rate": 5e-05, "loss": 0.7698, "step": 9400 }, { "epoch": 17.958412098298677, "grad_norm": 1.6835806369781494, "learning_rate": 5e-05, "loss": 0.7711, "step": 9500 }, { "epoch": 18.0, "eval_accuracy": 0.5603589743589743, "eval_loss": 2.473491668701172, "eval_runtime": 5.4257, "eval_samples_per_second": 92.155, "eval_steps_per_second": 11.611, "step": 9522 }, { "epoch": 18.0, "eval_exact_match": 17.0, "eval_f1": 24.074487734487732, "step": 9522 }, { "epoch": 18.147448015122873, "grad_norm": 2.0483736991882324, "learning_rate": 5e-05, "loss": 0.7048, "step": 9600 }, { "epoch": 18.33648393194707, "grad_norm": 1.987457275390625, "learning_rate": 5e-05, "loss": 0.6863, "step": 9700 }, { "epoch": 18.525519848771268, "grad_norm": 2.1744563579559326, "learning_rate": 5e-05, "loss": 0.7045, "step": 9800 }, { "epoch": 18.714555765595463, "grad_norm": 1.9871633052825928, "learning_rate": 5e-05, "loss": 0.7076, "step": 9900 }, { "epoch": 18.90359168241966, "grad_norm": 1.8895001411437988, "learning_rate": 5e-05, "loss": 0.7202, "step": 10000 }, { "epoch": 19.0, "eval_accuracy": 0.5601538461538461, "eval_loss": 2.5168700218200684, "eval_runtime": 5.0434, "eval_samples_per_second": 99.14, "eval_steps_per_second": 12.492, "step": 10051 }, { "epoch": 19.0, "eval_exact_match": 17.2, "eval_f1": 24.260360750360757, "step": 10051 }, { "epoch": 19.092627599243855, "grad_norm": 1.7914209365844727, "learning_rate": 5e-05, "loss": 0.6728, "step": 10100 }, { "epoch": 19.281663516068054, "grad_norm": 2.198495388031006, "learning_rate": 5e-05, "loss": 0.6241, "step": 10200 }, { "epoch": 19.47069943289225, "grad_norm": 1.81365966796875, "learning_rate": 5e-05, "loss": 0.6558, "step": 10300 }, { "epoch": 19.659735349716446, "grad_norm": 1.6571800708770752, "learning_rate": 5e-05, "loss": 0.6652, "step": 10400 }, { "epoch": 19.84877126654064, "grad_norm": 1.6469954252243042, "learning_rate": 5e-05, "loss": 0.6637, "step": 10500 }, { "epoch": 20.0, "eval_accuracy": 0.5596410256410257, "eval_loss": 2.516073703765869, "eval_runtime": 5.8234, "eval_samples_per_second": 85.86, "eval_steps_per_second": 10.818, "step": 10580 }, { "epoch": 20.0, "eval_exact_match": 17.6, "eval_f1": 25.225079365079367, "step": 10580 }, { "epoch": 20.03780718336484, "grad_norm": 1.7114077806472778, "learning_rate": 5e-05, "loss": 0.66, "step": 10600 }, { "epoch": 20.226843100189036, "grad_norm": 1.9962202310562134, "learning_rate": 5e-05, "loss": 0.5938, "step": 10700 }, { "epoch": 20.415879017013232, "grad_norm": 2.397592067718506, "learning_rate": 5e-05, "loss": 0.6042, "step": 10800 }, { "epoch": 20.604914933837428, "grad_norm": 2.0724213123321533, "learning_rate": 5e-05, "loss": 0.6124, "step": 10900 }, { "epoch": 20.793950850661627, "grad_norm": 1.9385457038879395, "learning_rate": 5e-05, "loss": 0.6078, "step": 11000 }, { "epoch": 20.982986767485823, "grad_norm": 2.034963607788086, "learning_rate": 5e-05, "loss": 0.6257, "step": 11100 }, { "epoch": 21.0, "eval_accuracy": 0.5597948717948718, "eval_loss": 2.5497183799743652, "eval_runtime": 5.0442, "eval_samples_per_second": 99.123, "eval_steps_per_second": 12.489, "step": 11109 }, { "epoch": 21.0, "eval_exact_match": 17.4, "eval_f1": 24.755238095238095, "step": 11109 }, { "epoch": 21.17202268431002, "grad_norm": 1.9768177270889282, "learning_rate": 5e-05, "loss": 0.5458, "step": 11200 }, { "epoch": 21.361058601134214, "grad_norm": 2.14420747756958, "learning_rate": 5e-05, "loss": 0.558, "step": 11300 }, { "epoch": 21.550094517958414, "grad_norm": 2.5267951488494873, "learning_rate": 5e-05, "loss": 0.5646, "step": 11400 }, { "epoch": 21.73913043478261, "grad_norm": 2.114386558532715, "learning_rate": 5e-05, "loss": 0.572, "step": 11500 }, { "epoch": 21.928166351606805, "grad_norm": 2.424217700958252, "learning_rate": 5e-05, "loss": 0.5928, "step": 11600 }, { "epoch": 22.0, "eval_accuracy": 0.5593846153846154, "eval_loss": 2.579535722732544, "eval_runtime": 5.7849, "eval_samples_per_second": 86.433, "eval_steps_per_second": 10.89, "step": 11638 }, { "epoch": 22.0, "eval_exact_match": 16.8, "eval_f1": 23.666507936507937, "step": 11638 }, { "epoch": 22.117202268431, "grad_norm": 1.7813427448272705, "learning_rate": 5e-05, "loss": 0.5342, "step": 11700 }, { "epoch": 22.3062381852552, "grad_norm": 1.989479422569275, "learning_rate": 5e-05, "loss": 0.5157, "step": 11800 }, { "epoch": 22.495274102079396, "grad_norm": 1.9694567918777466, "learning_rate": 5e-05, "loss": 0.5236, "step": 11900 }, { "epoch": 22.68431001890359, "grad_norm": 2.0226142406463623, "learning_rate": 5e-05, "loss": 0.5315, "step": 12000 }, { "epoch": 22.873345935727787, "grad_norm": 2.0204625129699707, "learning_rate": 5e-05, "loss": 0.5378, "step": 12100 }, { "epoch": 23.0, "eval_accuracy": 0.558974358974359, "eval_loss": 2.5709009170532227, "eval_runtime": 5.0788, "eval_samples_per_second": 98.449, "eval_steps_per_second": 12.405, "step": 12167 }, { "epoch": 23.0, "eval_exact_match": 16.4, "eval_f1": 23.69809523809524, "step": 12167 }, { "epoch": 23.062381852551987, "grad_norm": 2.0633811950683594, "learning_rate": 5e-05, "loss": 0.5183, "step": 12200 }, { "epoch": 23.251417769376182, "grad_norm": 2.3566079139709473, "learning_rate": 5e-05, "loss": 0.4761, "step": 12300 }, { "epoch": 23.440453686200378, "grad_norm": 2.368450880050659, "learning_rate": 5e-05, "loss": 0.487, "step": 12400 }, { "epoch": 23.629489603024574, "grad_norm": 2.1096372604370117, "learning_rate": 5e-05, "loss": 0.4945, "step": 12500 }, { "epoch": 23.81852551984877, "grad_norm": 2.2136454582214355, "learning_rate": 5e-05, "loss": 0.5035, "step": 12600 }, { "epoch": 24.0, "eval_accuracy": 0.5586153846153846, "eval_loss": 2.5898056030273438, "eval_runtime": 5.7458, "eval_samples_per_second": 87.021, "eval_steps_per_second": 10.965, "step": 12696 }, { "epoch": 24.0, "eval_exact_match": 17.8, "eval_f1": 24.89603174603175, "step": 12696 }, { "epoch": 24.00756143667297, "grad_norm": 2.317958116531372, "learning_rate": 5e-05, "loss": 0.5106, "step": 12700 }, { "epoch": 24.196597353497165, "grad_norm": 2.0783536434173584, "learning_rate": 5e-05, "loss": 0.4404, "step": 12800 }, { "epoch": 24.38563327032136, "grad_norm": 2.322791337966919, "learning_rate": 5e-05, "loss": 0.452, "step": 12900 }, { "epoch": 24.574669187145556, "grad_norm": 1.9763044118881226, "learning_rate": 5e-05, "loss": 0.4617, "step": 13000 }, { "epoch": 24.763705103969755, "grad_norm": 2.462392568588257, "learning_rate": 5e-05, "loss": 0.4731, "step": 13100 }, { "epoch": 24.95274102079395, "grad_norm": 2.2400307655334473, "learning_rate": 5e-05, "loss": 0.4751, "step": 13200 }, { "epoch": 25.0, "eval_accuracy": 0.5594871794871795, "eval_loss": 2.611313581466675, "eval_runtime": 5.3564, "eval_samples_per_second": 93.346, "eval_steps_per_second": 11.762, "step": 13225 }, { "epoch": 25.0, "eval_exact_match": 15.8, "eval_f1": 22.969047619047625, "step": 13225 }, { "epoch": 25.141776937618147, "grad_norm": 2.0931668281555176, "learning_rate": 5e-05, "loss": 0.4299, "step": 13300 }, { "epoch": 25.330812854442343, "grad_norm": 2.7056140899658203, "learning_rate": 5e-05, "loss": 0.422, "step": 13400 }, { "epoch": 25.519848771266542, "grad_norm": 1.9907532930374146, "learning_rate": 5e-05, "loss": 0.4374, "step": 13500 }, { "epoch": 25.708884688090738, "grad_norm": 2.3209221363067627, "learning_rate": 5e-05, "loss": 0.4376, "step": 13600 }, { "epoch": 25.897920604914933, "grad_norm": 3.2339212894439697, "learning_rate": 5e-05, "loss": 0.4432, "step": 13700 }, { "epoch": 26.0, "eval_accuracy": 0.5602051282051282, "eval_loss": 2.6182327270507812, "eval_runtime": 5.0321, "eval_samples_per_second": 99.362, "eval_steps_per_second": 12.52, "step": 13754 }, { "epoch": 26.0, "eval_exact_match": 15.4, "eval_f1": 23.059047619047618, "step": 13754 }, { "epoch": 26.08695652173913, "grad_norm": 1.9935250282287598, "learning_rate": 5e-05, "loss": 0.4265, "step": 13800 }, { "epoch": 26.27599243856333, "grad_norm": 2.5698649883270264, "learning_rate": 5e-05, "loss": 0.3949, "step": 13900 }, { "epoch": 26.465028355387524, "grad_norm": 2.568392038345337, "learning_rate": 5e-05, "loss": 0.4041, "step": 14000 }, { "epoch": 26.65406427221172, "grad_norm": 2.094651699066162, "learning_rate": 5e-05, "loss": 0.4128, "step": 14100 }, { "epoch": 26.843100189035916, "grad_norm": 2.240649700164795, "learning_rate": 5e-05, "loss": 0.4191, "step": 14200 }, { "epoch": 27.0, "eval_accuracy": 0.56, "eval_loss": 2.614851951599121, "eval_runtime": 5.636, "eval_samples_per_second": 88.715, "eval_steps_per_second": 11.178, "step": 14283 }, { "epoch": 27.0, "eval_exact_match": 17.2, "eval_f1": 24.941948051948046, "step": 14283 }, { "epoch": 27.032136105860115, "grad_norm": 1.8800567388534546, "learning_rate": 5e-05, "loss": 0.4135, "step": 14300 }, { "epoch": 27.22117202268431, "grad_norm": 2.525048017501831, "learning_rate": 5e-05, "loss": 0.3717, "step": 14400 }, { "epoch": 27.410207939508506, "grad_norm": 2.3803513050079346, "learning_rate": 5e-05, "loss": 0.3817, "step": 14500 }, { "epoch": 27.599243856332702, "grad_norm": 2.369356155395508, "learning_rate": 5e-05, "loss": 0.388, "step": 14600 }, { "epoch": 27.7882797731569, "grad_norm": 2.701702356338501, "learning_rate": 5e-05, "loss": 0.3943, "step": 14700 }, { "epoch": 27.977315689981097, "grad_norm": 2.2755825519561768, "learning_rate": 5e-05, "loss": 0.4015, "step": 14800 }, { "epoch": 28.0, "eval_accuracy": 0.5592820512820513, "eval_loss": 2.653672218322754, "eval_runtime": 5.3543, "eval_samples_per_second": 93.383, "eval_steps_per_second": 11.766, "step": 14812 }, { "epoch": 28.0, "eval_exact_match": 17.4, "eval_f1": 24.478888888888893, "step": 14812 }, { "epoch": 28.166351606805293, "grad_norm": 2.0458974838256836, "learning_rate": 5e-05, "loss": 0.3566, "step": 14900 }, { "epoch": 28.35538752362949, "grad_norm": 2.040117025375366, "learning_rate": 5e-05, "loss": 0.3602, "step": 15000 }, { "epoch": 28.544423440453688, "grad_norm": 1.947757363319397, "learning_rate": 5e-05, "loss": 0.3666, "step": 15100 }, { "epoch": 28.733459357277884, "grad_norm": 2.0905520915985107, "learning_rate": 5e-05, "loss": 0.3747, "step": 15200 }, { "epoch": 28.92249527410208, "grad_norm": 2.215851306915283, "learning_rate": 5e-05, "loss": 0.3798, "step": 15300 }, { "epoch": 29.0, "eval_accuracy": 0.5600512820512821, "eval_loss": 2.6304192543029785, "eval_runtime": 5.0359, "eval_samples_per_second": 99.288, "eval_steps_per_second": 12.51, "step": 15341 }, { "epoch": 29.0, "eval_exact_match": 16.2, "eval_f1": 23.909062049062054, "step": 15341 }, { "epoch": 29.111531190926275, "grad_norm": 2.5050926208496094, "learning_rate": 5e-05, "loss": 0.3538, "step": 15400 }, { "epoch": 29.300567107750474, "grad_norm": 2.0059566497802734, "learning_rate": 5e-05, "loss": 0.3365, "step": 15500 }, { "epoch": 29.48960302457467, "grad_norm": 2.7678000926971436, "learning_rate": 5e-05, "loss": 0.3523, "step": 15600 }, { "epoch": 29.678638941398866, "grad_norm": 2.6234450340270996, "learning_rate": 5e-05, "loss": 0.3552, "step": 15700 }, { "epoch": 29.86767485822306, "grad_norm": 1.9958972930908203, "learning_rate": 5e-05, "loss": 0.3629, "step": 15800 }, { "epoch": 30.0, "eval_accuracy": 0.56, "eval_loss": 2.6483516693115234, "eval_runtime": 5.0621, "eval_samples_per_second": 98.774, "eval_steps_per_second": 12.446, "step": 15870 }, { "epoch": 30.0, "eval_exact_match": 16.4, "eval_f1": 24.021746031746037, "step": 15870 }, { "epoch": 30.056710775047257, "grad_norm": 2.5568833351135254, "learning_rate": 5e-05, "loss": 0.3505, "step": 15900 }, { "epoch": 30.245746691871457, "grad_norm": 2.0313165187835693, "learning_rate": 5e-05, "loss": 0.3252, "step": 16000 }, { "epoch": 30.434782608695652, "grad_norm": 1.8152469396591187, "learning_rate": 5e-05, "loss": 0.335, "step": 16100 }, { "epoch": 30.623818525519848, "grad_norm": 2.0004210472106934, "learning_rate": 5e-05, "loss": 0.34, "step": 16200 }, { "epoch": 30.812854442344044, "grad_norm": 2.3558757305145264, "learning_rate": 5e-05, "loss": 0.3474, "step": 16300 }, { "epoch": 31.0, "eval_accuracy": 0.5605641025641026, "eval_loss": 2.6599507331848145, "eval_runtime": 5.3592, "eval_samples_per_second": 93.297, "eval_steps_per_second": 11.755, "step": 16399 }, { "epoch": 31.0, "eval_exact_match": 16.8, "eval_f1": 23.902857142857147, "step": 16399 }, { "epoch": 31.001890359168243, "grad_norm": 1.641440749168396, "learning_rate": 5e-05, "loss": 0.3478, "step": 16400 }, { "epoch": 31.19092627599244, "grad_norm": 2.425560712814331, "learning_rate": 5e-05, "loss": 0.3067, "step": 16500 }, { "epoch": 31.379962192816635, "grad_norm": 1.850221872329712, "learning_rate": 5e-05, "loss": 0.3202, "step": 16600 }, { "epoch": 31.56899810964083, "grad_norm": 1.4244951009750366, "learning_rate": 5e-05, "loss": 0.3301, "step": 16700 }, { "epoch": 31.75803402646503, "grad_norm": 2.093834638595581, "learning_rate": 5e-05, "loss": 0.3293, "step": 16800 }, { "epoch": 31.947069943289225, "grad_norm": 2.0703086853027344, "learning_rate": 5e-05, "loss": 0.3369, "step": 16900 }, { "epoch": 32.0, "eval_accuracy": 0.5597435897435897, "eval_loss": 2.6675195693969727, "eval_runtime": 5.1201, "eval_samples_per_second": 97.655, "eval_steps_per_second": 12.305, "step": 16928 }, { "epoch": 32.0, "eval_exact_match": 15.6, "eval_f1": 22.95126984126984, "step": 16928 }, { "epoch": 32.136105860113425, "grad_norm": 2.0121419429779053, "learning_rate": 5e-05, "loss": 0.3134, "step": 17000 }, { "epoch": 32.32514177693762, "grad_norm": 1.8783822059631348, "learning_rate": 5e-05, "loss": 0.3102, "step": 17100 }, { "epoch": 32.514177693761816, "grad_norm": 1.5319064855575562, "learning_rate": 5e-05, "loss": 0.3089, "step": 17200 }, { "epoch": 32.70321361058601, "grad_norm": 2.6580235958099365, "learning_rate": 5e-05, "loss": 0.3175, "step": 17300 }, { "epoch": 32.89224952741021, "grad_norm": 2.121335744857788, "learning_rate": 5e-05, "loss": 0.3275, "step": 17400 }, { "epoch": 33.0, "eval_accuracy": 0.5603589743589743, "eval_loss": 2.658357858657837, "eval_runtime": 5.4806, "eval_samples_per_second": 91.231, "eval_steps_per_second": 11.495, "step": 17457 }, { "epoch": 33.0, "eval_exact_match": 16.4, "eval_f1": 24.0452380952381, "step": 17457 }, { "epoch": 33.0812854442344, "grad_norm": 2.1282331943511963, "learning_rate": 5e-05, "loss": 0.311, "step": 17500 }, { "epoch": 33.2703213610586, "grad_norm": 2.212125062942505, "learning_rate": 5e-05, "loss": 0.2951, "step": 17600 }, { "epoch": 33.459357277882795, "grad_norm": 3.389835834503174, "learning_rate": 5e-05, "loss": 0.3068, "step": 17700 }, { "epoch": 33.648393194707, "grad_norm": 1.812578558921814, "learning_rate": 5e-05, "loss": 0.3091, "step": 17800 }, { "epoch": 33.83742911153119, "grad_norm": 2.434966802597046, "learning_rate": 5e-05, "loss": 0.311, "step": 17900 }, { "epoch": 34.0, "eval_accuracy": 0.5600512820512821, "eval_loss": 2.676753520965576, "eval_runtime": 5.3579, "eval_samples_per_second": 93.32, "eval_steps_per_second": 11.758, "step": 17986 }, { "epoch": 34.0, "eval_exact_match": 16.6, "eval_f1": 24.913650793650795, "step": 17986 }, { "epoch": 34.02646502835539, "grad_norm": 1.5352602005004883, "learning_rate": 5e-05, "loss": 0.3142, "step": 18000 }, { "epoch": 34.215500945179585, "grad_norm": 1.4355374574661255, "learning_rate": 5e-05, "loss": 0.2897, "step": 18100 }, { "epoch": 34.40453686200378, "grad_norm": 1.7993170022964478, "learning_rate": 5e-05, "loss": 0.2923, "step": 18200 }, { "epoch": 34.593572778827976, "grad_norm": 1.9478981494903564, "learning_rate": 5e-05, "loss": 0.3001, "step": 18300 }, { "epoch": 34.78260869565217, "grad_norm": 3.045933246612549, "learning_rate": 5e-05, "loss": 0.3034, "step": 18400 }, { "epoch": 34.97164461247637, "grad_norm": 1.8796017169952393, "learning_rate": 5e-05, "loss": 0.31, "step": 18500 }, { "epoch": 35.0, "eval_accuracy": 0.5601538461538461, "eval_loss": 2.6845180988311768, "eval_runtime": 5.7492, "eval_samples_per_second": 86.969, "eval_steps_per_second": 10.958, "step": 18515 }, { "epoch": 35.0, "eval_exact_match": 16.2, "eval_f1": 23.91369408369409, "step": 18515 }, { "epoch": 35.16068052930057, "grad_norm": 1.7619277238845825, "learning_rate": 5e-05, "loss": 0.2853, "step": 18600 }, { "epoch": 35.349716446124766, "grad_norm": 1.7933720350265503, "learning_rate": 5e-05, "loss": 0.2853, "step": 18700 }, { "epoch": 35.53875236294896, "grad_norm": 1.476181149482727, "learning_rate": 5e-05, "loss": 0.2955, "step": 18800 }, { "epoch": 35.72778827977316, "grad_norm": 1.6427425146102905, "learning_rate": 5e-05, "loss": 0.2934, "step": 18900 }, { "epoch": 35.916824196597354, "grad_norm": 1.7125171422958374, "learning_rate": 5e-05, "loss": 0.3009, "step": 19000 }, { "epoch": 36.0, "eval_accuracy": 0.5604615384615385, "eval_loss": 2.6663565635681152, "eval_runtime": 5.0382, "eval_samples_per_second": 99.241, "eval_steps_per_second": 12.504, "step": 19044 }, { "epoch": 36.0, "eval_exact_match": 16.0, "eval_f1": 23.221269841269848, "step": 19044 }, { "epoch": 36.10586011342155, "grad_norm": 1.757238507270813, "learning_rate": 5e-05, "loss": 0.286, "step": 19100 }, { "epoch": 36.294896030245745, "grad_norm": 1.9411671161651611, "learning_rate": 5e-05, "loss": 0.2761, "step": 19200 }, { "epoch": 36.48393194706994, "grad_norm": 2.305509328842163, "learning_rate": 5e-05, "loss": 0.2873, "step": 19300 }, { "epoch": 36.67296786389414, "grad_norm": 1.6525070667266846, "learning_rate": 5e-05, "loss": 0.2881, "step": 19400 }, { "epoch": 36.86200378071834, "grad_norm": 1.562827706336975, "learning_rate": 5e-05, "loss": 0.2959, "step": 19500 }, { "epoch": 37.0, "eval_accuracy": 0.5594871794871795, "eval_loss": 2.738389492034912, "eval_runtime": 5.0296, "eval_samples_per_second": 99.412, "eval_steps_per_second": 12.526, "step": 19573 }, { "epoch": 37.0, "eval_exact_match": 16.0, "eval_f1": 23.01206349206349, "step": 19573 }, { "epoch": 37.051039697542535, "grad_norm": 2.588029146194458, "learning_rate": 5e-05, "loss": 0.2885, "step": 19600 }, { "epoch": 37.24007561436673, "grad_norm": 2.2393579483032227, "learning_rate": 5e-05, "loss": 0.267, "step": 19700 }, { "epoch": 37.42911153119093, "grad_norm": 1.5349841117858887, "learning_rate": 5e-05, "loss": 0.2787, "step": 19800 }, { "epoch": 37.61814744801512, "grad_norm": 1.8498040437698364, "learning_rate": 5e-05, "loss": 0.2873, "step": 19900 }, { "epoch": 37.80718336483932, "grad_norm": 1.9959454536437988, "learning_rate": 5e-05, "loss": 0.2879, "step": 20000 }, { "epoch": 37.996219281663514, "grad_norm": 1.814253330230713, "learning_rate": 5e-05, "loss": 0.2927, "step": 20100 }, { "epoch": 38.0, "eval_accuracy": 0.5592820512820513, "eval_loss": 2.731750965118408, "eval_runtime": 5.032, "eval_samples_per_second": 99.364, "eval_steps_per_second": 12.52, "step": 20102 }, { "epoch": 38.0, "eval_exact_match": 16.4, "eval_f1": 23.767460317460323, "step": 20102 } ], "logging_steps": 100, "max_steps": 26450, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2383494727280886e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }