{ "best_metric": null, "best_model_checkpoint": null, "epoch": 24.997333333333334, "eval_steps": 500, "global_step": 4687, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5333333333333333, "grad_norm": 0.271308958530426, "learning_rate": 5e-05, "loss": 1.8262, "step": 100 }, { "epoch": 0.9973333333333333, "eval_accuracy": 0.6047174887892377, "eval_loss": 1.7027028799057007, "eval_runtime": 6.6338, "eval_samples_per_second": 75.372, "eval_steps_per_second": 9.497, "step": 187 }, { "epoch": 0.9973333333333333, "eval_exact_match": 14.6, "eval_f1": 23.665714285714298, "eval_qa_bleu": 10.24028390049623, "eval_qa_exact_match": 0.114, "eval_recite_bleu": 12.525534079250063, "eval_recite_exact_match": 0.0, "step": 187 }, { "epoch": 1.0666666666666667, "grad_norm": 0.301651269197464, "learning_rate": 5e-05, "loss": 1.7424, "step": 200 }, { "epoch": 1.6, "grad_norm": 0.35949045419692993, "learning_rate": 5e-05, "loss": 1.6978, "step": 300 }, { "epoch": 2.0, "eval_accuracy": 0.6068699551569506, "eval_loss": 1.6845285892486572, "eval_runtime": 5.7524, "eval_samples_per_second": 86.92, "eval_steps_per_second": 10.952, "step": 375 }, { "epoch": 2.0, "eval_exact_match": 14.6, "eval_f1": 23.74190476190477, "eval_qa_bleu": 9.70432698897927, "eval_qa_exact_match": 0.114, "eval_recite_bleu": 13.014822566317982, "eval_recite_exact_match": 0.006, "step": 375 }, { "epoch": 2.1333333333333333, "grad_norm": 0.41301965713500977, "learning_rate": 5e-05, "loss": 1.694, "step": 400 }, { "epoch": 2.6666666666666665, "grad_norm": 0.5057485699653625, "learning_rate": 5e-05, "loss": 1.6324, "step": 500 }, { "epoch": 2.997333333333333, "eval_accuracy": 0.6079551569506726, "eval_loss": 1.6851012706756592, "eval_runtime": 5.5314, "eval_samples_per_second": 90.394, "eval_steps_per_second": 11.39, "step": 562 }, { "epoch": 2.997333333333333, "eval_exact_match": 14.2, "eval_f1": 23.29190476190477, "eval_qa_bleu": 10.724307586340888, "eval_qa_exact_match": 0.104, "eval_recite_bleu": 12.83531066630853, "eval_recite_exact_match": 0.004, "step": 562 }, { "epoch": 3.2, "grad_norm": 0.6363334655761719, "learning_rate": 5e-05, "loss": 1.6063, "step": 600 }, { "epoch": 3.7333333333333334, "grad_norm": 0.6466453671455383, "learning_rate": 5e-05, "loss": 1.5562, "step": 700 }, { "epoch": 4.0, "eval_accuracy": 0.6071121076233184, "eval_loss": 1.6969172954559326, "eval_runtime": 6.2612, "eval_samples_per_second": 79.857, "eval_steps_per_second": 10.062, "step": 750 }, { "epoch": 4.0, "eval_exact_match": 14.0, "eval_f1": 21.34333333333334, "eval_qa_bleu": 10.250340106635626, "eval_qa_exact_match": 0.084, "eval_recite_bleu": 12.741830585824024, "eval_recite_exact_match": 0.004, "step": 750 }, { "epoch": 4.266666666666667, "grad_norm": 0.8162885904312134, "learning_rate": 5e-05, "loss": 1.5052, "step": 800 }, { "epoch": 4.8, "grad_norm": 0.8799439072608948, "learning_rate": 5e-05, "loss": 1.469, "step": 900 }, { "epoch": 4.997333333333334, "eval_accuracy": 0.6059282511210762, "eval_loss": 1.7216957807540894, "eval_runtime": 6.3012, "eval_samples_per_second": 79.35, "eval_steps_per_second": 9.998, "step": 937 }, { "epoch": 4.997333333333334, "eval_exact_match": 12.6, "eval_f1": 21.041428571428575, "eval_qa_bleu": 8.239984727774104, "eval_qa_exact_match": 0.094, "eval_recite_bleu": 12.327373697720192, "eval_recite_exact_match": 0.002, "step": 937 }, { "epoch": 5.333333333333333, "grad_norm": 0.9834449291229248, "learning_rate": 5e-05, "loss": 1.3845, "step": 1000 }, { "epoch": 5.866666666666667, "grad_norm": 0.9377098679542542, "learning_rate": 5e-05, "loss": 1.3927, "step": 1100 }, { "epoch": 6.0, "eval_accuracy": 0.6047354260089686, "eval_loss": 1.7564114332199097, "eval_runtime": 6.3447, "eval_samples_per_second": 78.806, "eval_steps_per_second": 9.93, "step": 1125 }, { "epoch": 6.0, "eval_exact_match": 13.4, "eval_f1": 21.70317460317461, "eval_qa_bleu": 10.522554916579404, "eval_qa_exact_match": 0.104, "eval_recite_bleu": 12.383604385812577, "eval_recite_exact_match": 0.006, "step": 1125 }, { "epoch": 6.4, "grad_norm": 1.1172072887420654, "learning_rate": 5e-05, "loss": 1.2924, "step": 1200 }, { "epoch": 6.933333333333334, "grad_norm": 1.0980473756790161, "learning_rate": 5e-05, "loss": 1.286, "step": 1300 }, { "epoch": 6.997333333333334, "eval_accuracy": 0.6026995515695067, "eval_loss": 1.7977122068405151, "eval_runtime": 5.9021, "eval_samples_per_second": 84.716, "eval_steps_per_second": 10.674, "step": 1312 }, { "epoch": 6.997333333333334, "eval_exact_match": 12.0, "eval_f1": 20.07142857142858, "eval_qa_bleu": 9.89685444999411, "eval_qa_exact_match": 0.102, "eval_recite_bleu": 12.555118604776576, "eval_recite_exact_match": 0.008, "step": 1312 }, { "epoch": 7.466666666666667, "grad_norm": 1.1534159183502197, "learning_rate": 5e-05, "loss": 1.1956, "step": 1400 }, { "epoch": 8.0, "grad_norm": 1.1209707260131836, "learning_rate": 5e-05, "loss": 1.1891, "step": 1500 }, { "epoch": 8.0, "eval_accuracy": 0.6010403587443947, "eval_loss": 1.840914249420166, "eval_runtime": 6.3016, "eval_samples_per_second": 79.345, "eval_steps_per_second": 9.997, "step": 1500 }, { "epoch": 8.0, "eval_exact_match": 12.8, "eval_f1": 21.598730158730167, "eval_qa_bleu": 9.863863368837224, "eval_qa_exact_match": 0.104, "eval_recite_bleu": 12.09730367063489, "eval_recite_exact_match": 0.01, "step": 1500 }, { "epoch": 8.533333333333333, "grad_norm": 1.3646217584609985, "learning_rate": 5e-05, "loss": 1.0861, "step": 1600 }, { "epoch": 8.997333333333334, "eval_accuracy": 0.5981255605381166, "eval_loss": 1.9223005771636963, "eval_runtime": 5.5092, "eval_samples_per_second": 90.757, "eval_steps_per_second": 11.435, "step": 1687 }, { "epoch": 8.997333333333334, "eval_exact_match": 14.8, "eval_f1": 22.160000000000004, "eval_qa_bleu": 8.357776258354589, "eval_qa_exact_match": 0.116, "eval_recite_bleu": 12.76267532884902, "eval_recite_exact_match": 0.01, "step": 1687 }, { "epoch": 9.066666666666666, "grad_norm": 1.6948473453521729, "learning_rate": 5e-05, "loss": 1.0935, "step": 1700 }, { "epoch": 9.6, "grad_norm": 1.6024364233016968, "learning_rate": 5e-05, "loss": 1.0127, "step": 1800 }, { "epoch": 10.0, "eval_accuracy": 0.5973632286995516, "eval_loss": 1.9414385557174683, "eval_runtime": 5.5824, "eval_samples_per_second": 89.568, "eval_steps_per_second": 11.286, "step": 1875 }, { "epoch": 10.0, "eval_exact_match": 12.4, "eval_f1": 20.75746031746032, "eval_qa_bleu": 8.760424886550503, "eval_qa_exact_match": 0.088, "eval_recite_bleu": 12.754935192029142, "eval_recite_exact_match": 0.016, "step": 1875 }, { "epoch": 10.133333333333333, "grad_norm": 1.6970032453536987, "learning_rate": 5e-05, "loss": 0.9788, "step": 1900 }, { "epoch": 10.666666666666666, "grad_norm": 1.559332251548767, "learning_rate": 5e-05, "loss": 0.926, "step": 2000 }, { "epoch": 10.997333333333334, "eval_accuracy": 0.5945201793721973, "eval_loss": 2.0455663204193115, "eval_runtime": 5.5259, "eval_samples_per_second": 90.484, "eval_steps_per_second": 11.401, "step": 2062 }, { "epoch": 10.997333333333334, "eval_exact_match": 13.0, "eval_f1": 20.56190476190476, "eval_qa_bleu": 11.437402190353293, "eval_qa_exact_match": 0.106, "eval_recite_bleu": 13.600786696850818, "eval_recite_exact_match": 0.022, "step": 2062 }, { "epoch": 11.2, "grad_norm": 1.667855143547058, "learning_rate": 5e-05, "loss": 0.8872, "step": 2100 }, { "epoch": 11.733333333333333, "grad_norm": 1.6040037870407104, "learning_rate": 5e-05, "loss": 0.8347, "step": 2200 }, { "epoch": 12.0, "eval_accuracy": 0.5923946188340807, "eval_loss": 2.116677761077881, "eval_runtime": 6.4038, "eval_samples_per_second": 78.078, "eval_steps_per_second": 9.838, "step": 2250 }, { "epoch": 12.0, "eval_exact_match": 13.6, "eval_f1": 20.470981240981256, "eval_qa_bleu": 8.968260765274712, "eval_qa_exact_match": 0.106, "eval_recite_bleu": 12.793280320213494, "eval_recite_exact_match": 0.014, "step": 2250 }, { "epoch": 12.266666666666667, "grad_norm": 1.794092059135437, "learning_rate": 5e-05, "loss": 0.8032, "step": 2300 }, { "epoch": 12.8, "grad_norm": 1.8774526119232178, "learning_rate": 5e-05, "loss": 0.7736, "step": 2400 }, { "epoch": 12.997333333333334, "eval_accuracy": 0.5902690582959641, "eval_loss": 2.179492950439453, "eval_runtime": 5.9945, "eval_samples_per_second": 83.409, "eval_steps_per_second": 10.51, "step": 2437 }, { "epoch": 12.997333333333334, "eval_exact_match": 12.8, "eval_f1": 20.780476190476197, "eval_qa_bleu": 11.176823944321667, "eval_qa_exact_match": 0.094, "eval_recite_bleu": 13.292972390103905, "eval_recite_exact_match": 0.02, "step": 2437 }, { "epoch": 13.333333333333334, "grad_norm": 1.9102168083190918, "learning_rate": 5e-05, "loss": 0.7168, "step": 2500 }, { "epoch": 13.866666666666667, "grad_norm": 1.9183365106582642, "learning_rate": 5e-05, "loss": 0.6903, "step": 2600 }, { "epoch": 14.0, "eval_accuracy": 0.5875874439461883, "eval_loss": 2.274404287338257, "eval_runtime": 6.2502, "eval_samples_per_second": 79.998, "eval_steps_per_second": 10.08, "step": 2625 }, { "epoch": 14.0, "eval_exact_match": 11.2, "eval_f1": 18.26571428571429, "eval_qa_bleu": 10.347091943612007, "eval_qa_exact_match": 0.082, "eval_recite_bleu": 12.720454352082847, "eval_recite_exact_match": 0.014, "step": 2625 }, { "epoch": 14.4, "grad_norm": 2.326387405395508, "learning_rate": 5e-05, "loss": 0.6312, "step": 2700 }, { "epoch": 14.933333333333334, "grad_norm": 2.1555840969085693, "learning_rate": 5e-05, "loss": 0.6267, "step": 2800 }, { "epoch": 14.997333333333334, "eval_accuracy": 0.5870582959641255, "eval_loss": 2.338937997817993, "eval_runtime": 6.643, "eval_samples_per_second": 75.267, "eval_steps_per_second": 9.484, "step": 2812 }, { "epoch": 14.997333333333334, "eval_exact_match": 10.6, "eval_f1": 18.311471861471865, "eval_qa_bleu": 9.56012066081651, "eval_qa_exact_match": 0.086, "eval_recite_bleu": 12.790292247765784, "eval_recite_exact_match": 0.022, "step": 2812 }, { "epoch": 15.466666666666667, "grad_norm": 1.8364830017089844, "learning_rate": 5e-05, "loss": 0.5582, "step": 2900 }, { "epoch": 16.0, "grad_norm": 1.9223560094833374, "learning_rate": 5e-05, "loss": 0.5673, "step": 3000 }, { "epoch": 16.0, "eval_accuracy": 0.585067264573991, "eval_loss": 2.4404428005218506, "eval_runtime": 6.2254, "eval_samples_per_second": 80.316, "eval_steps_per_second": 10.12, "step": 3000 }, { "epoch": 16.0, "eval_exact_match": 11.4, "eval_f1": 18.553809523809527, "eval_qa_bleu": 7.054735448183399, "eval_qa_exact_match": 0.088, "eval_recite_bleu": 12.41180854100743, "eval_recite_exact_match": 0.018, "step": 3000 }, { "epoch": 16.533333333333335, "grad_norm": 2.403259515762329, "learning_rate": 5e-05, "loss": 0.4886, "step": 3100 }, { "epoch": 16.997333333333334, "eval_accuracy": 0.5844035874439462, "eval_loss": 2.5225181579589844, "eval_runtime": 5.9008, "eval_samples_per_second": 84.734, "eval_steps_per_second": 10.677, "step": 3187 }, { "epoch": 16.997333333333334, "eval_exact_match": 10.6, "eval_f1": 17.309567099567108, "eval_qa_bleu": 7.6942420610670945, "eval_qa_exact_match": 0.086, "eval_recite_bleu": 11.974979252669918, "eval_recite_exact_match": 0.016, "step": 3187 }, { "epoch": 17.066666666666666, "grad_norm": 2.1279492378234863, "learning_rate": 5e-05, "loss": 0.4985, "step": 3200 }, { "epoch": 17.6, "grad_norm": 2.377486228942871, "learning_rate": 5e-05, "loss": 0.4357, "step": 3300 }, { "epoch": 18.0, "eval_accuracy": 0.5838026905829596, "eval_loss": 2.590667724609375, "eval_runtime": 5.984, "eval_samples_per_second": 83.556, "eval_steps_per_second": 10.528, "step": 3375 }, { "epoch": 18.0, "eval_exact_match": 10.8, "eval_f1": 18.150000000000006, "eval_qa_bleu": 10.081752461945443, "eval_qa_exact_match": 0.078, "eval_recite_bleu": 12.17977433983274, "eval_recite_exact_match": 0.018, "step": 3375 }, { "epoch": 18.133333333333333, "grad_norm": 2.425731897354126, "learning_rate": 5e-05, "loss": 0.4424, "step": 3400 }, { "epoch": 18.666666666666668, "grad_norm": 2.4585390090942383, "learning_rate": 5e-05, "loss": 0.3873, "step": 3500 }, { "epoch": 18.997333333333334, "eval_accuracy": 0.5823228699551569, "eval_loss": 2.6423120498657227, "eval_runtime": 5.5641, "eval_samples_per_second": 89.861, "eval_steps_per_second": 11.323, "step": 3562 }, { "epoch": 18.997333333333334, "eval_exact_match": 11.0, "eval_f1": 18.006349206349213, "eval_qa_bleu": 8.890762850928647, "eval_qa_exact_match": 0.092, "eval_recite_bleu": 12.489755382776915, "eval_recite_exact_match": 0.022, "step": 3562 }, { "epoch": 19.2, "grad_norm": 2.00604510307312, "learning_rate": 5e-05, "loss": 0.383, "step": 3600 }, { "epoch": 19.733333333333334, "grad_norm": 2.393627405166626, "learning_rate": 5e-05, "loss": 0.3497, "step": 3700 }, { "epoch": 20.0, "eval_accuracy": 0.5803587443946189, "eval_loss": 2.7260048389434814, "eval_runtime": 6.546, "eval_samples_per_second": 76.382, "eval_steps_per_second": 9.624, "step": 3750 }, { "epoch": 20.0, "eval_exact_match": 9.6, "eval_f1": 15.837460317460323, "eval_qa_bleu": 7.780402070269215, "eval_qa_exact_match": 0.078, "eval_recite_bleu": 12.344573606229106, "eval_recite_exact_match": 0.024, "step": 3750 }, { "epoch": 20.266666666666666, "grad_norm": 2.1227269172668457, "learning_rate": 5e-05, "loss": 0.3321, "step": 3800 }, { "epoch": 20.8, "grad_norm": 2.2578742504119873, "learning_rate": 5e-05, "loss": 0.314, "step": 3900 }, { "epoch": 20.997333333333334, "eval_accuracy": 0.5801883408071749, "eval_loss": 2.820314884185791, "eval_runtime": 5.6037, "eval_samples_per_second": 89.227, "eval_steps_per_second": 11.243, "step": 3937 }, { "epoch": 20.997333333333334, "eval_exact_match": 12.4, "eval_f1": 17.80337662337662, "eval_qa_bleu": 9.273311361506353, "eval_qa_exact_match": 0.096, "eval_recite_bleu": 12.39691014320001, "eval_recite_exact_match": 0.018, "step": 3937 }, { "epoch": 21.333333333333332, "grad_norm": 1.9079784154891968, "learning_rate": 5e-05, "loss": 0.2951, "step": 4000 }, { "epoch": 21.866666666666667, "grad_norm": 2.5200765132904053, "learning_rate": 5e-05, "loss": 0.2893, "step": 4100 }, { "epoch": 22.0, "eval_accuracy": 0.5794260089686099, "eval_loss": 2.905937433242798, "eval_runtime": 5.8733, "eval_samples_per_second": 85.132, "eval_steps_per_second": 10.727, "step": 4125 }, { "epoch": 22.0, "eval_exact_match": 10.2, "eval_f1": 16.468253968253975, "eval_qa_bleu": 6.31129540696892, "eval_qa_exact_match": 0.084, "eval_recite_bleu": 12.11260232701576, "eval_recite_exact_match": 0.022, "step": 4125 }, { "epoch": 22.4, "grad_norm": 1.9616879224777222, "learning_rate": 5e-05, "loss": 0.2584, "step": 4200 }, { "epoch": 22.933333333333334, "grad_norm": 1.854027271270752, "learning_rate": 5e-05, "loss": 0.2583, "step": 4300 }, { "epoch": 22.997333333333334, "eval_accuracy": 0.578914798206278, "eval_loss": 2.9786081314086914, "eval_runtime": 6.2893, "eval_samples_per_second": 79.5, "eval_steps_per_second": 10.017, "step": 4312 }, { "epoch": 22.997333333333334, "eval_exact_match": 10.2, "eval_f1": 16.564126984126986, "eval_qa_bleu": 6.2299815854348815, "eval_qa_exact_match": 0.086, "eval_recite_bleu": 11.81704787803335, "eval_recite_exact_match": 0.024, "step": 4312 }, { "epoch": 23.466666666666665, "grad_norm": 1.8527085781097412, "learning_rate": 5e-05, "loss": 0.2289, "step": 4400 }, { "epoch": 24.0, "grad_norm": 2.258077621459961, "learning_rate": 5e-05, "loss": 0.2382, "step": 4500 }, { "epoch": 24.0, "eval_accuracy": 0.5789596412556054, "eval_loss": 3.0398380756378174, "eval_runtime": 6.5231, "eval_samples_per_second": 76.651, "eval_steps_per_second": 9.658, "step": 4500 }, { "epoch": 24.0, "eval_exact_match": 10.2, "eval_f1": 16.815281385281384, "eval_qa_bleu": 6.817434246718267, "eval_qa_exact_match": 0.082, "eval_recite_bleu": 12.918682221726373, "eval_recite_exact_match": 0.022, "step": 4500 }, { "epoch": 24.533333333333335, "grad_norm": 2.0298190116882324, "learning_rate": 5e-05, "loss": 0.2051, "step": 4600 }, { "epoch": 24.997333333333334, "eval_accuracy": 0.5780627802690583, "eval_loss": 3.1147310733795166, "eval_runtime": 6.2317, "eval_samples_per_second": 80.236, "eval_steps_per_second": 10.11, "step": 4687 }, { "epoch": 24.997333333333334, "eval_exact_match": 10.8, "eval_f1": 17.1636507936508, "eval_qa_bleu": 8.438045873055385, "eval_qa_exact_match": 0.09, "eval_recite_bleu": 12.425198158877508, "eval_recite_exact_match": 0.028, "step": 4687 } ], "logging_steps": 100, "max_steps": 9350, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.721636404352778e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }