{ "best_metric": null, "best_model_checkpoint": null, "epoch": 36.0, "eval_steps": 500, "global_step": 19044, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1890359168241966, "grad_norm": 0.5558584332466125, "learning_rate": 3e-05, "loss": 2.1103, "step": 100 }, { "epoch": 0.3780718336483932, "grad_norm": 0.4733140468597412, "learning_rate": 3e-05, "loss": 1.931, "step": 200 }, { "epoch": 0.5671077504725898, "grad_norm": 0.48580050468444824, "learning_rate": 3e-05, "loss": 1.8948, "step": 300 }, { "epoch": 0.7561436672967864, "grad_norm": 0.40655234456062317, "learning_rate": 3e-05, "loss": 1.8988, "step": 400 }, { "epoch": 0.945179584120983, "grad_norm": 0.6303955316543579, "learning_rate": 3e-05, "loss": 1.8745, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.5701538461538461, "eval_loss": 1.6455246210098267, "eval_runtime": 5.0805, "eval_samples_per_second": 98.415, "eval_steps_per_second": 12.4, "step": 529 }, { "epoch": 1.0, "eval_exact_match": 15.0, "eval_f1": 23.927619047619068, "step": 529 }, { "epoch": 1.1342155009451795, "grad_norm": 0.4278818666934967, "learning_rate": 3e-05, "loss": 1.8798, "step": 600 }, { "epoch": 1.3232514177693762, "grad_norm": 0.449830025434494, "learning_rate": 3e-05, "loss": 1.8613, "step": 700 }, { "epoch": 1.5122873345935728, "grad_norm": 0.5130725502967834, "learning_rate": 3e-05, "loss": 1.8709, "step": 800 }, { "epoch": 1.7013232514177694, "grad_norm": 0.4889715611934662, "learning_rate": 3e-05, "loss": 1.8448, "step": 900 }, { "epoch": 1.8903591682419658, "grad_norm": 11.186380386352539, "learning_rate": 3e-05, "loss": 1.8764, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.5723589743589743, "eval_loss": 1.6143561601638794, "eval_runtime": 5.0466, "eval_samples_per_second": 99.077, "eval_steps_per_second": 12.484, "step": 1058 }, { "epoch": 2.0, "eval_exact_match": 15.6, "eval_f1": 25.825238095238113, "step": 1058 }, { "epoch": 2.0793950850661624, "grad_norm": 0.5414931774139404, "learning_rate": 3e-05, "loss": 1.8398, "step": 1100 }, { "epoch": 2.268431001890359, "grad_norm": 7.441005706787109, "learning_rate": 3e-05, "loss": 1.8348, "step": 1200 }, { "epoch": 2.4574669187145557, "grad_norm": 24.312145233154297, "learning_rate": 3e-05, "loss": 1.8421, "step": 1300 }, { "epoch": 2.6465028355387523, "grad_norm": 0.7407662868499756, "learning_rate": 3e-05, "loss": 1.8292, "step": 1400 }, { "epoch": 2.835538752362949, "grad_norm": 0.9653642177581787, "learning_rate": 3e-05, "loss": 1.828, "step": 1500 }, { "epoch": 3.0, "eval_accuracy": 0.5734358974358974, "eval_loss": 1.617864727973938, "eval_runtime": 5.2153, "eval_samples_per_second": 95.872, "eval_steps_per_second": 12.08, "step": 1587 }, { "epoch": 3.0, "eval_exact_match": 16.2, "eval_f1": 25.811428571428586, "step": 1587 }, { "epoch": 3.0245746691871456, "grad_norm": 450.6055603027344, "learning_rate": 3e-05, "loss": 1.8204, "step": 1600 }, { "epoch": 3.213610586011342, "grad_norm": 2.1127028465270996, "learning_rate": 3e-05, "loss": 1.8361, "step": 1700 }, { "epoch": 3.402646502835539, "grad_norm": 564.9320068359375, "learning_rate": 3e-05, "loss": 1.8338, "step": 1800 }, { "epoch": 3.5916824196597354, "grad_norm": 52.5078010559082, "learning_rate": 3e-05, "loss": 1.8075, "step": 1900 }, { "epoch": 3.780718336483932, "grad_norm": 27.87557601928711, "learning_rate": 3e-05, "loss": 1.8175, "step": 2000 }, { "epoch": 3.9697542533081287, "grad_norm": 4.810656547546387, "learning_rate": 3e-05, "loss": 1.8218, "step": 2100 }, { "epoch": 4.0, "eval_accuracy": 0.5728205128205128, "eval_loss": 1.6224406957626343, "eval_runtime": 5.0588, "eval_samples_per_second": 98.838, "eval_steps_per_second": 12.454, "step": 2116 }, { "epoch": 4.0, "eval_exact_match": 17.4, "eval_f1": 27.194285714285726, "step": 2116 }, { "epoch": 4.158790170132325, "grad_norm": 13.004076957702637, "learning_rate": 3e-05, "loss": 1.8089, "step": 2200 }, { "epoch": 4.3478260869565215, "grad_norm": 2.23368501663208, "learning_rate": 3e-05, "loss": 1.8292, "step": 2300 }, { "epoch": 4.536862003780718, "grad_norm": 0.8635586500167847, "learning_rate": 3e-05, "loss": 1.7992, "step": 2400 }, { "epoch": 4.725897920604915, "grad_norm": 6.5326738357543945, "learning_rate": 3e-05, "loss": 1.7955, "step": 2500 }, { "epoch": 4.914933837429111, "grad_norm": 463.4647521972656, "learning_rate": 3e-05, "loss": 1.7953, "step": 2600 }, { "epoch": 5.0, "eval_accuracy": 0.5740512820512821, "eval_loss": 1.6217337846755981, "eval_runtime": 5.1898, "eval_samples_per_second": 96.342, "eval_steps_per_second": 12.139, "step": 2645 }, { "epoch": 5.0, "eval_exact_match": 17.6, "eval_f1": 26.161428571428587, "step": 2645 }, { "epoch": 5.103969754253308, "grad_norm": 203.82815551757812, "learning_rate": 3e-05, "loss": 1.7845, "step": 2700 }, { "epoch": 5.293005671077505, "grad_norm": 660.8076782226562, "learning_rate": 3e-05, "loss": 1.7909, "step": 2800 }, { "epoch": 5.482041587901701, "grad_norm": 5727.51806640625, "learning_rate": 3e-05, "loss": 1.7909, "step": 2900 }, { "epoch": 5.671077504725898, "grad_norm": 4880698.0, "learning_rate": 3e-05, "loss": 3.5781, "step": 3000 }, { "epoch": 5.8601134215500945, "grad_norm": 1192.6517333984375, "learning_rate": 3e-05, "loss": 3.8703, "step": 3100 }, { "epoch": 6.0, "eval_accuracy": 0.5433846153846154, "eval_loss": 1.886108636856079, "eval_runtime": 5.5162, "eval_samples_per_second": 90.643, "eval_steps_per_second": 11.421, "step": 3174 }, { "epoch": 6.0, "eval_exact_match": 11.4, "eval_f1": 20.358571428571437, "step": 3174 }, { "epoch": 6.049149338374291, "grad_norm": 532.3778686523438, "learning_rate": 3e-05, "loss": 2.1701, "step": 3200 }, { "epoch": 6.238185255198488, "grad_norm": 565.6697998046875, "learning_rate": 3e-05, "loss": 1.8533, "step": 3300 }, { "epoch": 6.427221172022684, "grad_norm": 25206.482421875, "learning_rate": 3e-05, "loss": 2.623, "step": 3400 }, { "epoch": 6.616257088846881, "grad_norm": 71.88556671142578, "learning_rate": 3e-05, "loss": 3.1573, "step": 3500 }, { "epoch": 6.805293005671078, "grad_norm": 269563.15625, "learning_rate": 3e-05, "loss": 2.2696, "step": 3600 }, { "epoch": 6.994328922495274, "grad_norm": 11768.7880859375, "learning_rate": 3e-05, "loss": 3.1506, "step": 3700 }, { "epoch": 7.0, "eval_accuracy": 0.2834871794871795, "eval_loss": 8.311681747436523, "eval_runtime": 5.3223, "eval_samples_per_second": 93.944, "eval_steps_per_second": 11.837, "step": 3703 }, { "epoch": 7.0, "eval_exact_match": 0.0, "eval_f1": 0.027458851580988985, "step": 3703 }, { "epoch": 7.183364839319471, "grad_norm": 6220.75244140625, "learning_rate": 3e-05, "loss": 4.273, "step": 3800 }, { "epoch": 7.3724007561436675, "grad_norm": 3387.953857421875, "learning_rate": 3e-05, "loss": 7.7199, "step": 3900 }, { "epoch": 7.561436672967864, "grad_norm": 372.8681640625, "learning_rate": 3e-05, "loss": 7.7045, "step": 4000 }, { "epoch": 7.750472589792061, "grad_norm": 159.64283752441406, "learning_rate": 3e-05, "loss": 6.8901, "step": 4100 }, { "epoch": 7.939508506616257, "grad_norm": 751876.125, "learning_rate": 3e-05, "loss": 6.5238, "step": 4200 }, { "epoch": 8.0, "eval_accuracy": 0.22764102564102565, "eval_loss": 7.12347936630249, "eval_runtime": 5.3818, "eval_samples_per_second": 92.905, "eval_steps_per_second": 11.706, "step": 4232 }, { "epoch": 8.0, "eval_exact_match": 0.0, "eval_f1": 0.013333333333333336, "step": 4232 }, { "epoch": 8.128544423440454, "grad_norm": 1035664128.0, "learning_rate": 3e-05, "loss": 5.4157, "step": 4300 }, { "epoch": 8.31758034026465, "grad_norm": 11571242.0, "learning_rate": 3e-05, "loss": 5.8257, "step": 4400 }, { "epoch": 8.506616257088847, "grad_norm": 88236256.0, "learning_rate": 3e-05, "loss": 6.3609, "step": 4500 }, { "epoch": 8.695652173913043, "grad_norm": 105426080.0, "learning_rate": 3e-05, "loss": 6.1824, "step": 4600 }, { "epoch": 8.88468809073724, "grad_norm": 51438844.0, "learning_rate": 3e-05, "loss": 6.1818, "step": 4700 }, { "epoch": 9.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 11.187646865844727, "eval_runtime": 5.154, "eval_samples_per_second": 97.013, "eval_steps_per_second": 12.224, "step": 4761 }, { "epoch": 9.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 4761 }, { "epoch": 9.073724007561436, "grad_norm": 8742571.0, "learning_rate": 3e-05, "loss": 7.5029, "step": 4800 }, { "epoch": 9.262759924385634, "grad_norm": 4885384192.0, "learning_rate": 3e-05, "loss": 7.805, "step": 4900 }, { "epoch": 9.45179584120983, "grad_norm": 919074304.0, "learning_rate": 3e-05, "loss": 8.0712, "step": 5000 }, { "epoch": 9.640831758034027, "grad_norm": 9706883072.0, "learning_rate": 3e-05, "loss": 8.4915, "step": 5100 }, { "epoch": 9.829867674858223, "grad_norm": 35723780096.0, "learning_rate": 3e-05, "loss": 8.3286, "step": 5200 }, { "epoch": 10.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 13.374640464782715, "eval_runtime": 5.2089, "eval_samples_per_second": 95.99, "eval_steps_per_second": 12.095, "step": 5290 }, { "epoch": 10.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 5290 }, { "epoch": 10.01890359168242, "grad_norm": 172018032.0, "learning_rate": 3e-05, "loss": 8.4403, "step": 5300 }, { "epoch": 10.207939508506616, "grad_norm": 329336992.0, "learning_rate": 3e-05, "loss": 8.3233, "step": 5400 }, { "epoch": 10.396975425330814, "grad_norm": 74210560.0, "learning_rate": 3e-05, "loss": 8.3673, "step": 5500 }, { "epoch": 10.58601134215501, "grad_norm": 497968480.0, "learning_rate": 3e-05, "loss": 9.8311, "step": 5600 }, { "epoch": 10.775047258979207, "grad_norm": 34138104.0, "learning_rate": 3e-05, "loss": 10.8161, "step": 5700 }, { "epoch": 10.964083175803403, "grad_norm": 308232992.0, "learning_rate": 3e-05, "loss": 10.2827, "step": 5800 }, { "epoch": 11.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 12.878591537475586, "eval_runtime": 5.1038, "eval_samples_per_second": 97.966, "eval_steps_per_second": 12.344, "step": 5819 }, { "epoch": 11.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 5819 }, { "epoch": 11.1531190926276, "grad_norm": 255807824.0, "learning_rate": 3e-05, "loss": 9.8782, "step": 5900 }, { "epoch": 11.342155009451796, "grad_norm": 27976615936.0, "learning_rate": 3e-05, "loss": 9.879, "step": 6000 }, { "epoch": 11.531190926275993, "grad_norm": 13783190732800.0, "learning_rate": 3e-05, "loss": 9.8617, "step": 6100 }, { "epoch": 11.720226843100189, "grad_norm": 3467661017088.0, "learning_rate": 3e-05, "loss": 10.1028, "step": 6200 }, { "epoch": 11.909262759924385, "grad_norm": 333752.3125, "learning_rate": 3e-05, "loss": 9.4972, "step": 6300 }, { "epoch": 12.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 13.161654472351074, "eval_runtime": 5.0927, "eval_samples_per_second": 98.18, "eval_steps_per_second": 12.371, "step": 6348 }, { "epoch": 12.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 6348 }, { "epoch": 12.098298676748582, "grad_norm": 6690696704.0, "learning_rate": 3e-05, "loss": 9.2939, "step": 6400 }, { "epoch": 12.287334593572778, "grad_norm": 148906835968.0, "learning_rate": 3e-05, "loss": 9.3499, "step": 6500 }, { "epoch": 12.476370510396976, "grad_norm": 15515338752.0, "learning_rate": 3e-05, "loss": 9.3496, "step": 6600 }, { "epoch": 12.665406427221171, "grad_norm": 76318760960.0, "learning_rate": 3e-05, "loss": 9.2728, "step": 6700 }, { "epoch": 12.854442344045369, "grad_norm": 85637283840.0, "learning_rate": 3e-05, "loss": 9.4453, "step": 6800 }, { "epoch": 13.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 12.748088836669922, "eval_runtime": 5.1332, "eval_samples_per_second": 97.405, "eval_steps_per_second": 12.273, "step": 6877 }, { "epoch": 13.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 6877 }, { "epoch": 13.043478260869565, "grad_norm": 370487552.0, "learning_rate": 3e-05, "loss": 9.8639, "step": 6900 }, { "epoch": 13.232514177693762, "grad_norm": 243782.78125, "learning_rate": 3e-05, "loss": 9.2996, "step": 7000 }, { "epoch": 13.421550094517958, "grad_norm": 572714688.0, "learning_rate": 3e-05, "loss": 9.0366, "step": 7100 }, { "epoch": 13.610586011342155, "grad_norm": 807026112.0, "learning_rate": 3e-05, "loss": 8.9777, "step": 7200 }, { "epoch": 13.799621928166351, "grad_norm": 401406112.0, "learning_rate": 3e-05, "loss": 10.3094, "step": 7300 }, { "epoch": 13.988657844990549, "grad_norm": 20012445696.0, "learning_rate": 3e-05, "loss": 11.1388, "step": 7400 }, { "epoch": 14.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 11.845151901245117, "eval_runtime": 5.7887, "eval_samples_per_second": 86.376, "eval_steps_per_second": 10.883, "step": 7406 }, { "epoch": 14.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 7406 }, { "epoch": 14.177693761814744, "grad_norm": 61066740.0, "learning_rate": 3e-05, "loss": 9.4144, "step": 7500 }, { "epoch": 14.366729678638942, "grad_norm": 58445844.0, "learning_rate": 3e-05, "loss": 10.1294, "step": 7600 }, { "epoch": 14.555765595463138, "grad_norm": 40712032.0, "learning_rate": 3e-05, "loss": 12.1512, "step": 7700 }, { "epoch": 14.744801512287335, "grad_norm": 118321208.0, "learning_rate": 3e-05, "loss": 10.8949, "step": 7800 }, { "epoch": 14.93383742911153, "grad_norm": 61603804.0, "learning_rate": 3e-05, "loss": 10.849, "step": 7900 }, { "epoch": 15.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 14.529706954956055, "eval_runtime": 5.7662, "eval_samples_per_second": 86.712, "eval_steps_per_second": 10.926, "step": 7935 }, { "epoch": 15.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 7935 }, { "epoch": 15.122873345935728, "grad_norm": 1755613568.0, "learning_rate": 3e-05, "loss": 10.7302, "step": 8000 }, { "epoch": 15.311909262759924, "grad_norm": 1215193088.0, "learning_rate": 3e-05, "loss": 10.1428, "step": 8100 }, { "epoch": 15.500945179584122, "grad_norm": 40452497408.0, "learning_rate": 3e-05, "loss": 10.0763, "step": 8200 }, { "epoch": 15.689981096408317, "grad_norm": 10178840576.0, "learning_rate": 3e-05, "loss": 10.2204, "step": 8300 }, { "epoch": 15.879017013232515, "grad_norm": 10069305344.0, "learning_rate": 3e-05, "loss": 9.9088, "step": 8400 }, { "epoch": 16.0, "eval_accuracy": 0.22523076923076923, "eval_loss": 14.802911758422852, "eval_runtime": 5.0764, "eval_samples_per_second": 98.494, "eval_steps_per_second": 12.41, "step": 8464 }, { "epoch": 16.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 8464 }, { "epoch": 16.068052930056712, "grad_norm": 9244783616.0, "learning_rate": 3e-05, "loss": 10.2574, "step": 8500 }, { "epoch": 16.257088846880908, "grad_norm": 556177664.0, "learning_rate": 3e-05, "loss": 10.6479, "step": 8600 }, { "epoch": 16.446124763705104, "grad_norm": 44021864.0, "learning_rate": 3e-05, "loss": 11.8164, "step": 8700 }, { "epoch": 16.6351606805293, "grad_norm": 17835092.0, "learning_rate": 3e-05, "loss": 12.391, "step": 8800 }, { "epoch": 16.8241965973535, "grad_norm": 39907684.0, "learning_rate": 3e-05, "loss": 12.3715, "step": 8900 }, { "epoch": 17.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 15.876049995422363, "eval_runtime": 5.1121, "eval_samples_per_second": 97.807, "eval_steps_per_second": 12.324, "step": 8993 }, { "epoch": 17.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 8993 }, { "epoch": 17.013232514177695, "grad_norm": 5489768.0, "learning_rate": 3e-05, "loss": 12.9394, "step": 9000 }, { "epoch": 17.20226843100189, "grad_norm": 552121280.0, "learning_rate": 3e-05, "loss": 15.0591, "step": 9100 }, { "epoch": 17.391304347826086, "grad_norm": 4284707.5, "learning_rate": 3e-05, "loss": 14.7709, "step": 9200 }, { "epoch": 17.58034026465028, "grad_norm": 32860224.0, "learning_rate": 3e-05, "loss": 15.33, "step": 9300 }, { "epoch": 17.76937618147448, "grad_norm": 500232672.0, "learning_rate": 3e-05, "loss": 15.2314, "step": 9400 }, { "epoch": 17.958412098298677, "grad_norm": 138954320.0, "learning_rate": 3e-05, "loss": 14.0399, "step": 9500 }, { "epoch": 18.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 15.959660530090332, "eval_runtime": 5.4552, "eval_samples_per_second": 91.656, "eval_steps_per_second": 11.549, "step": 9522 }, { "epoch": 18.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 9522 }, { "epoch": 18.147448015122873, "grad_norm": 17529152.0, "learning_rate": 3e-05, "loss": 15.3582, "step": 9600 }, { "epoch": 18.33648393194707, "grad_norm": 14452664.0, "learning_rate": 3e-05, "loss": 16.0595, "step": 9700 }, { "epoch": 18.525519848771268, "grad_norm": 9175341.0, "learning_rate": 3e-05, "loss": 15.2152, "step": 9800 }, { "epoch": 18.714555765595463, "grad_norm": 3109339.75, "learning_rate": 3e-05, "loss": 15.6083, "step": 9900 }, { "epoch": 18.90359168241966, "grad_norm": 1141542.125, "learning_rate": 3e-05, "loss": 15.6927, "step": 10000 }, { "epoch": 19.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 16.572154998779297, "eval_runtime": 5.784, "eval_samples_per_second": 86.445, "eval_steps_per_second": 10.892, "step": 10051 }, { "epoch": 19.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 10051 }, { "epoch": 19.092627599243855, "grad_norm": 3051544.0, "learning_rate": 3e-05, "loss": 15.4752, "step": 10100 }, { "epoch": 19.281663516068054, "grad_norm": 32653528.0, "learning_rate": 3e-05, "loss": 15.3329, "step": 10200 }, { "epoch": 19.47069943289225, "grad_norm": 373167.34375, "learning_rate": 3e-05, "loss": 15.1992, "step": 10300 }, { "epoch": 19.659735349716446, "grad_norm": 761316507648.0, "learning_rate": 3e-05, "loss": 14.5326, "step": 10400 }, { "epoch": 19.84877126654064, "grad_norm": 15194965.0, "learning_rate": 3e-05, "loss": 14.6703, "step": 10500 }, { "epoch": 20.0, "eval_accuracy": 0.2026153846153846, "eval_loss": 12.01396656036377, "eval_runtime": 5.164, "eval_samples_per_second": 96.824, "eval_steps_per_second": 12.2, "step": 10580 }, { "epoch": 20.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 10580 }, { "epoch": 20.03780718336484, "grad_norm": 1443913.625, "learning_rate": 3e-05, "loss": 13.4546, "step": 10600 }, { "epoch": 20.226843100189036, "grad_norm": 38678328.0, "learning_rate": 3e-05, "loss": 15.1074, "step": 10700 }, { "epoch": 20.415879017013232, "grad_norm": 235378832.0, "learning_rate": 3e-05, "loss": 15.8794, "step": 10800 }, { "epoch": 20.604914933837428, "grad_norm": 7457460.5, "learning_rate": 3e-05, "loss": 15.9728, "step": 10900 }, { "epoch": 20.793950850661627, "grad_norm": 360401824.0, "learning_rate": 3e-05, "loss": 16.3397, "step": 11000 }, { "epoch": 20.982986767485823, "grad_norm": 5285015.0, "learning_rate": 3e-05, "loss": 16.1349, "step": 11100 }, { "epoch": 21.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 16.624380111694336, "eval_runtime": 5.0669, "eval_samples_per_second": 98.68, "eval_steps_per_second": 12.434, "step": 11109 }, { "epoch": 21.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 11109 }, { "epoch": 21.17202268431002, "grad_norm": 3120185999360.0, "learning_rate": 3e-05, "loss": 15.2325, "step": 11200 }, { "epoch": 21.361058601134214, "grad_norm": 459053662208.0, "learning_rate": 3e-05, "loss": 14.9804, "step": 11300 }, { "epoch": 21.550094517958414, "grad_norm": 33530312704.0, "learning_rate": 3e-05, "loss": 13.4482, "step": 11400 }, { "epoch": 21.73913043478261, "grad_norm": 524117879029760.0, "learning_rate": 3e-05, "loss": 12.8065, "step": 11500 }, { "epoch": 21.928166351606805, "grad_norm": 73415352385536.0, "learning_rate": 3e-05, "loss": 13.1619, "step": 11600 }, { "epoch": 22.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 15.610301971435547, "eval_runtime": 5.8348, "eval_samples_per_second": 85.693, "eval_steps_per_second": 10.797, "step": 11638 }, { "epoch": 22.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 11638 }, { "epoch": 22.117202268431, "grad_norm": 24663130374144.0, "learning_rate": 3e-05, "loss": 13.4567, "step": 11700 }, { "epoch": 22.3062381852552, "grad_norm": 1152798.75, "learning_rate": 3e-05, "loss": 13.3323, "step": 11800 }, { "epoch": 22.495274102079396, "grad_norm": 6407195.0, "learning_rate": 3e-05, "loss": 12.224, "step": 11900 }, { "epoch": 22.68431001890359, "grad_norm": 3026044.75, "learning_rate": 3e-05, "loss": 11.6534, "step": 12000 }, { "epoch": 22.873345935727787, "grad_norm": 433589.25, "learning_rate": 3e-05, "loss": 11.176, "step": 12100 }, { "epoch": 23.0, "eval_accuracy": 0.20005128205128206, "eval_loss": 9.37710952758789, "eval_runtime": 5.4109, "eval_samples_per_second": 92.407, "eval_steps_per_second": 11.643, "step": 12167 }, { "epoch": 23.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 12167 }, { "epoch": 23.062381852551987, "grad_norm": 221080977408.0, "learning_rate": 3e-05, "loss": 10.2085, "step": 12200 }, { "epoch": 23.251417769376182, "grad_norm": 819255836672.0, "learning_rate": 3e-05, "loss": 10.3606, "step": 12300 }, { "epoch": 23.440453686200378, "grad_norm": 1077065216.0, "learning_rate": 3e-05, "loss": 10.3544, "step": 12400 }, { "epoch": 23.629489603024574, "grad_norm": 147283214336.0, "learning_rate": 3e-05, "loss": 10.6161, "step": 12500 }, { "epoch": 23.81852551984877, "grad_norm": 479103936.0, "learning_rate": 3e-05, "loss": 11.0412, "step": 12600 }, { "epoch": 24.0, "eval_accuracy": 0.2362051282051282, "eval_loss": 8.926570892333984, "eval_runtime": 5.7624, "eval_samples_per_second": 86.769, "eval_steps_per_second": 10.933, "step": 12696 }, { "epoch": 24.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 12696 }, { "epoch": 24.00756143667297, "grad_norm": 18736.490234375, "learning_rate": 3e-05, "loss": 10.5723, "step": 12700 }, { "epoch": 24.196597353497165, "grad_norm": 10122.208984375, "learning_rate": 3e-05, "loss": 9.1505, "step": 12800 }, { "epoch": 24.38563327032136, "grad_norm": 884941.875, "learning_rate": 3e-05, "loss": 8.6122, "step": 12900 }, { "epoch": 24.574669187145556, "grad_norm": 51996.421875, "learning_rate": 3e-05, "loss": 8.5405, "step": 13000 }, { "epoch": 24.763705103969755, "grad_norm": 11833.41796875, "learning_rate": 3e-05, "loss": 8.4026, "step": 13100 }, { "epoch": 24.95274102079395, "grad_norm": 645689536.0, "learning_rate": 3e-05, "loss": 8.8444, "step": 13200 }, { "epoch": 25.0, "eval_accuracy": 0.20235897435897435, "eval_loss": 10.400785446166992, "eval_runtime": 5.8159, "eval_samples_per_second": 85.971, "eval_steps_per_second": 10.832, "step": 13225 }, { "epoch": 25.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 13225 }, { "epoch": 25.141776937618147, "grad_norm": 33455.7734375, "learning_rate": 3e-05, "loss": 8.9314, "step": 13300 }, { "epoch": 25.330812854442343, "grad_norm": 665939.625, "learning_rate": 3e-05, "loss": 8.9419, "step": 13400 }, { "epoch": 25.519848771266542, "grad_norm": 366503.25, "learning_rate": 3e-05, "loss": 9.1085, "step": 13500 }, { "epoch": 25.708884688090738, "grad_norm": 1146194755584.0, "learning_rate": 3e-05, "loss": 9.1301, "step": 13600 }, { "epoch": 25.897920604914933, "grad_norm": 5115191230464.0, "learning_rate": 3e-05, "loss": 8.9435, "step": 13700 }, { "epoch": 26.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 12.846183776855469, "eval_runtime": 5.1674, "eval_samples_per_second": 96.76, "eval_steps_per_second": 12.192, "step": 13754 }, { "epoch": 26.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 13754 }, { "epoch": 26.08695652173913, "grad_norm": 119.87647247314453, "learning_rate": 3e-05, "loss": 10.1216, "step": 13800 }, { "epoch": 26.27599243856333, "grad_norm": 5832.49462890625, "learning_rate": 3e-05, "loss": 8.5386, "step": 13900 }, { "epoch": 26.465028355387524, "grad_norm": 7308.30908203125, "learning_rate": 3e-05, "loss": 8.1093, "step": 14000 }, { "epoch": 26.65406427221172, "grad_norm": 21071.32421875, "learning_rate": 3e-05, "loss": 8.0342, "step": 14100 }, { "epoch": 26.843100189035916, "grad_norm": 69589.0625, "learning_rate": 3e-05, "loss": 8.0313, "step": 14200 }, { "epoch": 27.0, "eval_accuracy": 0.23364102564102565, "eval_loss": 7.521541595458984, "eval_runtime": 5.0339, "eval_samples_per_second": 99.326, "eval_steps_per_second": 12.515, "step": 14283 }, { "epoch": 27.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 14283 }, { "epoch": 27.032136105860115, "grad_norm": 158736.625, "learning_rate": 3e-05, "loss": 8.1019, "step": 14300 }, { "epoch": 27.22117202268431, "grad_norm": 12491.8095703125, "learning_rate": 3e-05, "loss": 8.2181, "step": 14400 }, { "epoch": 27.410207939508506, "grad_norm": 123181.3671875, "learning_rate": 3e-05, "loss": 8.3091, "step": 14500 }, { "epoch": 27.599243856332702, "grad_norm": 620153.5, "learning_rate": 3e-05, "loss": 8.4058, "step": 14600 }, { "epoch": 27.7882797731569, "grad_norm": 337683.1875, "learning_rate": 3e-05, "loss": 8.619, "step": 14700 }, { "epoch": 27.977315689981097, "grad_norm": 5629.42431640625, "learning_rate": 3e-05, "loss": 8.3923, "step": 14800 }, { "epoch": 28.0, "eval_accuracy": 0.21205128205128204, "eval_loss": 7.625494003295898, "eval_runtime": 5.1068, "eval_samples_per_second": 97.909, "eval_steps_per_second": 12.336, "step": 14812 }, { "epoch": 28.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 14812 }, { "epoch": 28.166351606805293, "grad_norm": 779905.0625, "learning_rate": 3e-05, "loss": 8.2129, "step": 14900 }, { "epoch": 28.35538752362949, "grad_norm": 92020.96875, "learning_rate": 3e-05, "loss": 8.3803, "step": 15000 }, { "epoch": 28.544423440453688, "grad_norm": 1754453.125, "learning_rate": 3e-05, "loss": 10.3285, "step": 15100 }, { "epoch": 28.733459357277884, "grad_norm": 9245.0390625, "learning_rate": 3e-05, "loss": 10.3239, "step": 15200 }, { "epoch": 28.92249527410208, "grad_norm": 57860676.0, "learning_rate": 3e-05, "loss": 8.5368, "step": 15300 }, { "epoch": 29.0, "eval_accuracy": 0.1995897435897436, "eval_loss": 9.528727531433105, "eval_runtime": 5.1078, "eval_samples_per_second": 97.89, "eval_steps_per_second": 12.334, "step": 15341 }, { "epoch": 29.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 15341 }, { "epoch": 29.111531190926275, "grad_norm": 95621760.0, "learning_rate": 3e-05, "loss": 9.2102, "step": 15400 }, { "epoch": 29.300567107750474, "grad_norm": 68808096.0, "learning_rate": 3e-05, "loss": 9.1584, "step": 15500 }, { "epoch": 29.48960302457467, "grad_norm": 394869760.0, "learning_rate": 3e-05, "loss": 9.0096, "step": 15600 }, { "epoch": 29.678638941398866, "grad_norm": 295963264.0, "learning_rate": 3e-05, "loss": 8.1208, "step": 15700 }, { "epoch": 29.86767485822306, "grad_norm": 471883200.0, "learning_rate": 3e-05, "loss": 7.964, "step": 15800 }, { "epoch": 30.0, "eval_accuracy": 0.19994871794871794, "eval_loss": 7.713045120239258, "eval_runtime": 5.3921, "eval_samples_per_second": 92.728, "eval_steps_per_second": 11.684, "step": 15870 }, { "epoch": 30.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 15870 }, { "epoch": 30.056710775047257, "grad_norm": 1943886.25, "learning_rate": 3e-05, "loss": 8.051, "step": 15900 }, { "epoch": 30.245746691871457, "grad_norm": 252297.234375, "learning_rate": 3e-05, "loss": 8.2152, "step": 16000 }, { "epoch": 30.434782608695652, "grad_norm": 20118.697265625, "learning_rate": 3e-05, "loss": 8.586, "step": 16100 }, { "epoch": 30.623818525519848, "grad_norm": 30278450.0, "learning_rate": 3e-05, "loss": 8.594, "step": 16200 }, { "epoch": 30.812854442344044, "grad_norm": 491771.5, "learning_rate": 3e-05, "loss": 8.861, "step": 16300 }, { "epoch": 31.0, "eval_accuracy": 0.2362051282051282, "eval_loss": 7.903472423553467, "eval_runtime": 5.622, "eval_samples_per_second": 88.937, "eval_steps_per_second": 11.206, "step": 16399 }, { "epoch": 31.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 16399 }, { "epoch": 31.001890359168243, "grad_norm": 67330.3125, "learning_rate": 3e-05, "loss": 8.3547, "step": 16400 }, { "epoch": 31.19092627599244, "grad_norm": 437164.03125, "learning_rate": 3e-05, "loss": 9.0483, "step": 16500 }, { "epoch": 31.379962192816635, "grad_norm": 383445.78125, "learning_rate": 3e-05, "loss": 8.8258, "step": 16600 }, { "epoch": 31.56899810964083, "grad_norm": 26452162510848.0, "learning_rate": 3e-05, "loss": 9.2178, "step": 16700 }, { "epoch": 31.75803402646503, "grad_norm": 1477470060544.0, "learning_rate": 3e-05, "loss": 9.0897, "step": 16800 }, { "epoch": 31.947069943289225, "grad_norm": 3547858688.0, "learning_rate": 3e-05, "loss": 9.1201, "step": 16900 }, { "epoch": 32.0, "eval_accuracy": 0.2362051282051282, "eval_loss": 8.878704071044922, "eval_runtime": 5.7948, "eval_samples_per_second": 86.284, "eval_steps_per_second": 10.872, "step": 16928 }, { "epoch": 32.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 16928 }, { "epoch": 32.136105860113425, "grad_norm": 25736851226624.0, "learning_rate": 3e-05, "loss": 9.0721, "step": 17000 }, { "epoch": 32.32514177693762, "grad_norm": 83071770624.0, "learning_rate": 3e-05, "loss": 9.1184, "step": 17100 }, { "epoch": 32.514177693761816, "grad_norm": 362845667328.0, "learning_rate": 3e-05, "loss": 9.1277, "step": 17200 }, { "epoch": 32.70321361058601, "grad_norm": 6446509981696.0, "learning_rate": 3e-05, "loss": 9.1827, "step": 17300 }, { "epoch": 32.89224952741021, "grad_norm": 8676001710080.0, "learning_rate": 3e-05, "loss": 9.1903, "step": 17400 }, { "epoch": 33.0, "eval_accuracy": 0.2003076923076923, "eval_loss": 12.143142700195312, "eval_runtime": 5.0907, "eval_samples_per_second": 98.218, "eval_steps_per_second": 12.375, "step": 17457 }, { "epoch": 33.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 17457 }, { "epoch": 33.0812854442344, "grad_norm": 2477152272384.0, "learning_rate": 3e-05, "loss": 10.3355, "step": 17500 }, { "epoch": 33.2703213610586, "grad_norm": 1226347118592.0, "learning_rate": 3e-05, "loss": 9.7714, "step": 17600 }, { "epoch": 33.459357277882795, "grad_norm": 6838296248320.0, "learning_rate": 3e-05, "loss": 9.9791, "step": 17700 }, { "epoch": 33.648393194707, "grad_norm": 1059247161344.0, "learning_rate": 3e-05, "loss": 8.9139, "step": 17800 }, { "epoch": 33.83742911153119, "grad_norm": 783651008.0, "learning_rate": 3e-05, "loss": 8.6886, "step": 17900 }, { "epoch": 34.0, "eval_accuracy": 0.2073846153846154, "eval_loss": 8.974021911621094, "eval_runtime": 5.0381, "eval_samples_per_second": 99.245, "eval_steps_per_second": 12.505, "step": 17986 }, { "epoch": 34.0, "eval_exact_match": 0.0, "eval_f1": 0.048600856756742646, "step": 17986 }, { "epoch": 34.02646502835539, "grad_norm": 950630948536320.0, "learning_rate": 3e-05, "loss": 9.4673, "step": 18000 }, { "epoch": 34.215500945179585, "grad_norm": 450093280.0, "learning_rate": 3e-05, "loss": 8.886, "step": 18100 }, { "epoch": 34.40453686200378, "grad_norm": 87242632.0, "learning_rate": 3e-05, "loss": 10.0368, "step": 18200 }, { "epoch": 34.593572778827976, "grad_norm": 57701945344.0, "learning_rate": 3e-05, "loss": 9.5221, "step": 18300 }, { "epoch": 34.78260869565217, "grad_norm": 1806314112.0, "learning_rate": 3e-05, "loss": 9.5085, "step": 18400 }, { "epoch": 34.97164461247637, "grad_norm": 2088303.0, "learning_rate": 3e-05, "loss": 9.166, "step": 18500 }, { "epoch": 35.0, "eval_accuracy": 0.22430769230769232, "eval_loss": 8.694220542907715, "eval_runtime": 6.1359, "eval_samples_per_second": 81.487, "eval_steps_per_second": 10.267, "step": 18515 }, { "epoch": 35.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 18515 }, { "epoch": 35.16068052930057, "grad_norm": 993076051968.0, "learning_rate": 3e-05, "loss": 9.1192, "step": 18600 }, { "epoch": 35.349716446124766, "grad_norm": 126950965248.0, "learning_rate": 3e-05, "loss": 8.607, "step": 18700 }, { "epoch": 35.53875236294896, "grad_norm": 816930160640.0, "learning_rate": 3e-05, "loss": 8.1998, "step": 18800 }, { "epoch": 35.72778827977316, "grad_norm": 80379322368.0, "learning_rate": 3e-05, "loss": 9.6685, "step": 18900 }, { "epoch": 35.916824196597354, "grad_norm": 34520543232.0, "learning_rate": 3e-05, "loss": 10.7328, "step": 19000 }, { "epoch": 36.0, "eval_accuracy": 0.23302564102564102, "eval_loss": 11.274002075195312, "eval_runtime": 5.2319, "eval_samples_per_second": 95.567, "eval_steps_per_second": 12.041, "step": 19044 }, { "epoch": 36.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "step": 19044 } ], "logging_steps": 100, "max_steps": 26450, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.549816304708354e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }