{ "best_metric": null, "best_model_checkpoint": null, "epoch": 12.0, "eval_steps": 500, "global_step": 6348, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1890359168241966, "grad_norm": 7.269421577453613, "learning_rate": 0.0005, "loss": 1.9195, "step": 100 }, { "epoch": 0.3780718336483932, "grad_norm": 1.071650505065918, "learning_rate": 0.0005, "loss": 1.9396, "step": 200 }, { "epoch": 0.5671077504725898, "grad_norm": 0.5662499070167542, "learning_rate": 0.0005, "loss": 2.3641, "step": 300 }, { "epoch": 0.7561436672967864, "grad_norm": 0.47088223695755005, "learning_rate": 0.0005, "loss": 1.8969, "step": 400 }, { "epoch": 0.945179584120983, "grad_norm": 0.3653932511806488, "learning_rate": 0.0005, "loss": 1.8531, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.6214529147982063, "eval_loss": 1.5580899715423584, "eval_runtime": 5.56, "eval_samples_per_second": 89.928, "eval_steps_per_second": 11.331, "step": 529 }, { "epoch": 1.0, "eval_exact_match": 14.4, "eval_f1": 20.983333333333345, "eval_qa_bleu": 12.514681390986086, "eval_qa_exact_match": 0.102, "eval_recite_bleu": 12.61307158352263, "eval_recite_exact_match": 0.0, "step": 529 }, { "epoch": 1.1342155009451795, "grad_norm": 0.5318541526794434, "learning_rate": 0.0005, "loss": 1.6622, "step": 600 }, { "epoch": 1.3232514177693762, "grad_norm": 235.6529998779297, "learning_rate": 0.0005, "loss": 1.5596, "step": 700 }, { "epoch": 1.5122873345935728, "grad_norm": 0.5057249069213867, "learning_rate": 0.0005, "loss": 1.5296, "step": 800 }, { "epoch": 1.7013232514177694, "grad_norm": 0.5339187383651733, "learning_rate": 0.0005, "loss": 1.5036, "step": 900 }, { "epoch": 1.8903591682419658, "grad_norm": 0.5649825930595398, "learning_rate": 0.0005, "loss": 1.4884, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.6643587443946188, "eval_loss": 1.2189537286758423, "eval_runtime": 6.192, "eval_samples_per_second": 80.749, "eval_steps_per_second": 10.174, "step": 1058 }, { "epoch": 2.0, "eval_exact_match": 11.2, "eval_f1": 18.31238095238096, "eval_qa_bleu": 10.766593815425745, "eval_qa_exact_match": 0.096, "eval_recite_bleu": 13.257993783037875, "eval_recite_exact_match": 0.002, "step": 1058 }, { "epoch": 2.0793950850661624, "grad_norm": 0.6042512655258179, "learning_rate": 0.0005, "loss": 1.2966, "step": 1100 }, { "epoch": 2.268431001890359, "grad_norm": 965.2459106445312, "learning_rate": 0.0005, "loss": 8.8153, "step": 1200 }, { "epoch": 2.4574669187145557, "grad_norm": 3.0128896236419678, "learning_rate": 0.0005, "loss": 9.1241, "step": 1300 }, { "epoch": 2.6465028355387523, "grad_norm": 108062179328.0, "learning_rate": 0.0005, "loss": 10.5947, "step": 1400 }, { "epoch": 2.835538752362949, "grad_norm": 5002999889920.0, "learning_rate": 0.0005, "loss": 12.5972, "step": 1500 }, { "epoch": 3.0, "eval_accuracy": 0.21054708520179372, "eval_loss": 19.724363327026367, "eval_runtime": 6.5285, "eval_samples_per_second": 76.587, "eval_steps_per_second": 9.65, "step": 1587 }, { "epoch": 3.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "eval_qa_bleu": 0.0, "eval_qa_exact_match": 0.0, "eval_recite_bleu": 0.0, "eval_recite_exact_match": 0.0, "step": 1587 }, { "epoch": 3.0245746691871456, "grad_norm": 62.19309616088867, "learning_rate": 0.0005, "loss": 13.3192, "step": 1600 }, { "epoch": 3.213610586011342, "grad_norm": 4.109750747680664, "learning_rate": 0.0005, "loss": 8.4804, "step": 1700 }, { "epoch": 3.402646502835539, "grad_norm": 7.620689392089844, "learning_rate": 0.0005, "loss": 8.347, "step": 1800 }, { "epoch": 3.5916824196597354, "grad_norm": 33.092830657958984, "learning_rate": 0.0005, "loss": 7.4667, "step": 1900 }, { "epoch": 3.780718336483932, "grad_norm": 204828.203125, "learning_rate": 0.0005, "loss": 7.885, "step": 2000 }, { "epoch": 3.9697542533081287, "grad_norm": 229958729728.0, "learning_rate": 0.0005, "loss": 8.4174, "step": 2100 }, { "epoch": 4.0, "eval_accuracy": 0.22234977578475337, "eval_loss": 9.6956148147583, "eval_runtime": 5.4245, "eval_samples_per_second": 92.174, "eval_steps_per_second": 11.614, "step": 2116 }, { "epoch": 4.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "eval_qa_bleu": 0.0, "eval_qa_exact_match": 0.0, "eval_recite_bleu": 0.0, "eval_recite_exact_match": 0.0, "step": 2116 }, { "epoch": 4.158790170132325, "grad_norm": 17.098289489746094, "learning_rate": 0.0005, "loss": 10.4359, "step": 2200 }, { "epoch": 4.3478260869565215, "grad_norm": 1069.9039306640625, "learning_rate": 0.0005, "loss": 9.5369, "step": 2300 }, { "epoch": 4.536862003780718, "grad_norm": 7.694222927093506, "learning_rate": 0.0005, "loss": 7.7028, "step": 2400 }, { "epoch": 4.725897920604915, "grad_norm": Infinity, "learning_rate": 0.0005, "loss": 8.4852, "step": 2500 }, { "epoch": 4.914933837429111, "grad_norm": 396939770200064.0, "learning_rate": 0.0005, "loss": 9.5996, "step": 2600 }, { "epoch": 5.0, "eval_accuracy": 0.2166188340807175, "eval_loss": 10.412571907043457, "eval_runtime": 5.8473, "eval_samples_per_second": 85.51, "eval_steps_per_second": 10.774, "step": 2645 }, { "epoch": 5.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "eval_qa_bleu": 0.0, "eval_qa_exact_match": 0.0, "eval_recite_bleu": 0.0, "eval_recite_exact_match": 0.0, "step": 2645 }, { "epoch": 5.103969754253308, "grad_norm": 3.166937635134374e+18, "learning_rate": 0.0005, "loss": 10.3832, "step": 2700 }, { "epoch": 5.293005671077505, "grad_norm": 1.8363503072027607e+18, "learning_rate": 0.0005, "loss": 9.7717, "step": 2800 }, { "epoch": 5.482041587901701, "grad_norm": 113262112800768.0, "learning_rate": 0.0005, "loss": 8.3066, "step": 2900 }, { "epoch": 5.671077504725898, "grad_norm": 1283434880.0, "learning_rate": 0.0005, "loss": 8.6003, "step": 3000 }, { "epoch": 5.8601134215500945, "grad_norm": 1221250.5, "learning_rate": 0.0005, "loss": 9.1302, "step": 3100 }, { "epoch": 6.0, "eval_accuracy": 0.21948878923766815, "eval_loss": 9.434032440185547, "eval_runtime": 6.1208, "eval_samples_per_second": 81.689, "eval_steps_per_second": 10.293, "step": 3174 }, { "epoch": 6.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "eval_qa_bleu": 0.0, "eval_qa_exact_match": 0.0, "eval_recite_bleu": 0.0, "eval_recite_exact_match": 0.0, "step": 3174 }, { "epoch": 6.049149338374291, "grad_norm": 2019.499267578125, "learning_rate": 0.0005, "loss": 9.3999, "step": 3200 }, { "epoch": 6.238185255198488, "grad_norm": 164.9034881591797, "learning_rate": 0.0005, "loss": 8.1768, "step": 3300 }, { "epoch": 6.427221172022684, "grad_norm": 434.93157958984375, "learning_rate": 0.0005, "loss": 7.6329, "step": 3400 }, { "epoch": 6.616257088846881, "grad_norm": 1324.7332763671875, "learning_rate": 0.0005, "loss": 7.7765, "step": 3500 }, { "epoch": 6.805293005671078, "grad_norm": 52531002605568.0, "learning_rate": 0.0005, "loss": 8.006, "step": 3600 }, { "epoch": 6.994328922495274, "grad_norm": Infinity, "learning_rate": 0.0005, "loss": 15.3926, "step": 3700 }, { "epoch": 7.0, "eval_accuracy": 0.23054708520179373, "eval_loss": 15.052428245544434, "eval_runtime": 5.4671, "eval_samples_per_second": 91.457, "eval_steps_per_second": 11.524, "step": 3703 }, { "epoch": 7.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "eval_qa_bleu": 0.0, "eval_qa_exact_match": 0.0, "eval_recite_bleu": 0.0, "eval_recite_exact_match": 0.0, "step": 3703 }, { "epoch": 7.183364839319471, "grad_norm": Infinity, "learning_rate": 0.0005, "loss": 17.6263, "step": 3800 }, { "epoch": 7.3724007561436675, "grad_norm": Infinity, "learning_rate": 0.0005, "loss": 17.7669, "step": 3900 }, { "epoch": 7.561436672967864, "grad_norm": Infinity, "learning_rate": 0.0005, "loss": 17.7462, "step": 4000 }, { "epoch": 7.750472589792061, "grad_norm": Infinity, "learning_rate": 0.0005, "loss": 17.6186, "step": 4100 }, { "epoch": 7.939508506616257, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 1574.3041, "step": 4200 }, { "epoch": 8.0, "eval_accuracy": 0.21060089686098654, "eval_loss": NaN, "eval_runtime": 5.4376, "eval_samples_per_second": 91.952, "eval_steps_per_second": 11.586, "step": 4232 }, { "epoch": 8.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "eval_qa_bleu": 0.0, "eval_qa_exact_match": 0.0, "eval_recite_bleu": 0.0, "eval_recite_exact_match": 0.0, "step": 4232 }, { "epoch": 8.128544423440454, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 4300 }, { "epoch": 8.31758034026465, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 4400 }, { "epoch": 8.506616257088847, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 4500 }, { "epoch": 8.695652173913043, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 4600 }, { "epoch": 8.88468809073724, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 4700 }, { "epoch": 9.0, "eval_accuracy": 0.21060089686098654, "eval_loss": NaN, "eval_runtime": 6.1031, "eval_samples_per_second": 81.925, "eval_steps_per_second": 10.323, "step": 4761 }, { "epoch": 9.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "eval_qa_bleu": 0.0, "eval_qa_exact_match": 0.0, "eval_recite_bleu": 0.0, "eval_recite_exact_match": 0.0, "step": 4761 }, { "epoch": 9.073724007561436, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 4800 }, { "epoch": 9.262759924385634, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 4900 }, { "epoch": 9.45179584120983, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 5000 }, { "epoch": 9.640831758034027, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 5100 }, { "epoch": 9.829867674858223, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 5200 }, { "epoch": 10.0, "eval_accuracy": 0.21060089686098654, "eval_loss": NaN, "eval_runtime": 5.5143, "eval_samples_per_second": 90.673, "eval_steps_per_second": 11.425, "step": 5290 }, { "epoch": 10.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "eval_qa_bleu": 0.0, "eval_qa_exact_match": 0.0, "eval_recite_bleu": 0.0, "eval_recite_exact_match": 0.0, "step": 5290 }, { "epoch": 10.01890359168242, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 5300 }, { "epoch": 10.207939508506616, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 5400 }, { "epoch": 10.396975425330814, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 5500 }, { "epoch": 10.58601134215501, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 5600 }, { "epoch": 10.775047258979207, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 5700 }, { "epoch": 10.964083175803403, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 5800 }, { "epoch": 11.0, "eval_accuracy": 0.21060089686098654, "eval_loss": NaN, "eval_runtime": 6.5846, "eval_samples_per_second": 75.934, "eval_steps_per_second": 9.568, "step": 5819 }, { "epoch": 11.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "eval_qa_bleu": 0.0, "eval_qa_exact_match": 0.0, "eval_recite_bleu": 0.0, "eval_recite_exact_match": 0.0, "step": 5819 }, { "epoch": 11.1531190926276, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 5900 }, { "epoch": 11.342155009451796, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 6000 }, { "epoch": 11.531190926275993, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 6100 }, { "epoch": 11.720226843100189, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 6200 }, { "epoch": 11.909262759924385, "grad_norm": NaN, "learning_rate": 0.0005, "loss": 0.0, "step": 6300 }, { "epoch": 12.0, "eval_accuracy": 0.21060089686098654, "eval_loss": NaN, "eval_runtime": 5.4929, "eval_samples_per_second": 91.027, "eval_steps_per_second": 11.469, "step": 6348 }, { "epoch": 12.0, "eval_exact_match": 0.0, "eval_f1": 0.0, "eval_qa_bleu": 0.0, "eval_qa_exact_match": 0.0, "eval_recite_bleu": 0.0, "eval_recite_exact_match": 0.0, "step": 6348 } ], "logging_steps": 100, "max_steps": 26450, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.785483100511273e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }