{ "best_metric": null, "best_model_checkpoint": null, "epoch": 12.997333333333334, "eval_steps": 500, "global_step": 2437, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5333333333333333, "grad_norm": 0.2409825623035431, "learning_rate": 0.0001, "loss": 1.7886, "step": 100 }, { "epoch": 0.9973333333333333, "eval_accuracy": 0.6061076233183856, "eval_loss": 1.6901254653930664, "eval_runtime": 5.5484, "eval_samples_per_second": 90.116, "eval_steps_per_second": 11.355, "step": 187 }, { "epoch": 1.0666666666666667, "grad_norm": 0.2729445993900299, "learning_rate": 0.0001, "loss": 1.7283, "step": 200 }, { "epoch": 1.6, "grad_norm": 0.37678101658821106, "learning_rate": 0.0001, "loss": 1.6544, "step": 300 }, { "epoch": 2.0, "eval_accuracy": 0.607677130044843, "eval_loss": 1.6766204833984375, "eval_runtime": 6.4294, "eval_samples_per_second": 77.768, "eval_steps_per_second": 9.799, "step": 375 }, { "epoch": 2.1333333333333333, "grad_norm": 0.4604090750217438, "learning_rate": 0.0001, "loss": 1.6338, "step": 400 }, { "epoch": 2.6666666666666665, "grad_norm": 0.5606330633163452, "learning_rate": 0.0001, "loss": 1.5273, "step": 500 }, { "epoch": 2.997333333333333, "eval_accuracy": 0.6079910313901346, "eval_loss": 1.6929064989089966, "eval_runtime": 5.431, "eval_samples_per_second": 92.063, "eval_steps_per_second": 11.6, "step": 562 }, { "epoch": 3.2, "grad_norm": 0.6363856196403503, "learning_rate": 0.0001, "loss": 1.4796, "step": 600 }, { "epoch": 3.7333333333333334, "grad_norm": 0.5985731482505798, "learning_rate": 0.0001, "loss": 1.3871, "step": 700 }, { "epoch": 4.0, "eval_accuracy": 0.606896860986547, "eval_loss": 1.7257370948791504, "eval_runtime": 5.8318, "eval_samples_per_second": 85.737, "eval_steps_per_second": 10.803, "step": 750 }, { "epoch": 4.266666666666667, "grad_norm": 0.7769015431404114, "learning_rate": 0.0001, "loss": 1.3013, "step": 800 }, { "epoch": 4.8, "grad_norm": 0.708043098449707, "learning_rate": 0.0001, "loss": 1.23, "step": 900 }, { "epoch": 4.997333333333334, "eval_accuracy": 0.6061076233183856, "eval_loss": 1.7812739610671997, "eval_runtime": 5.9335, "eval_samples_per_second": 84.267, "eval_steps_per_second": 10.618, "step": 937 }, { "epoch": 5.333333333333333, "grad_norm": 0.8580867648124695, "learning_rate": 0.0001, "loss": 1.1011, "step": 1000 }, { "epoch": 5.866666666666667, "grad_norm": 0.9232878684997559, "learning_rate": 0.0001, "loss": 1.0749, "step": 1100 }, { "epoch": 6.0, "eval_accuracy": 0.6018026905829597, "eval_loss": 1.877648949623108, "eval_runtime": 6.474, "eval_samples_per_second": 77.231, "eval_steps_per_second": 9.731, "step": 1125 }, { "epoch": 6.4, "grad_norm": 1.0364516973495483, "learning_rate": 0.0001, "loss": 0.9164, "step": 1200 }, { "epoch": 6.933333333333334, "grad_norm": 1.0082701444625854, "learning_rate": 0.0001, "loss": 0.8957, "step": 1300 }, { "epoch": 6.997333333333334, "eval_accuracy": 0.5997937219730942, "eval_loss": 1.978247046470642, "eval_runtime": 5.9491, "eval_samples_per_second": 84.047, "eval_steps_per_second": 10.59, "step": 1312 }, { "epoch": 7.466666666666667, "grad_norm": 1.0375924110412598, "learning_rate": 0.0001, "loss": 0.7382, "step": 1400 }, { "epoch": 8.0, "grad_norm": 1.115308403968811, "learning_rate": 0.0001, "loss": 0.729, "step": 1500 }, { "epoch": 8.0, "eval_accuracy": 0.5965919282511211, "eval_loss": 2.097362756729126, "eval_runtime": 6.5905, "eval_samples_per_second": 75.867, "eval_steps_per_second": 9.559, "step": 1500 }, { "epoch": 8.533333333333333, "grad_norm": 1.3251475095748901, "learning_rate": 0.0001, "loss": 0.5643, "step": 1600 }, { "epoch": 8.997333333333334, "eval_accuracy": 0.5931300448430493, "eval_loss": 2.2553179264068604, "eval_runtime": 6.1864, "eval_samples_per_second": 80.823, "eval_steps_per_second": 10.184, "step": 1687 }, { "epoch": 9.066666666666666, "grad_norm": 1.0753710269927979, "learning_rate": 0.0001, "loss": 0.5706, "step": 1700 }, { "epoch": 9.6, "grad_norm": 1.5658849477767944, "learning_rate": 0.0001, "loss": 0.4538, "step": 1800 }, { "epoch": 10.0, "eval_accuracy": 0.5900627802690583, "eval_loss": 2.408897876739502, "eval_runtime": 6.4582, "eval_samples_per_second": 77.422, "eval_steps_per_second": 9.755, "step": 1875 }, { "epoch": 10.133333333333333, "grad_norm": 1.6254231929779053, "learning_rate": 0.0001, "loss": 0.4286, "step": 1900 }, { "epoch": 10.666666666666666, "grad_norm": 1.187232255935669, "learning_rate": 0.0001, "loss": 0.3563, "step": 2000 }, { "epoch": 10.997333333333334, "eval_accuracy": 0.588914798206278, "eval_loss": 2.529802083969116, "eval_runtime": 5.9806, "eval_samples_per_second": 83.604, "eval_steps_per_second": 10.534, "step": 2062 }, { "epoch": 11.2, "grad_norm": 1.3838512897491455, "learning_rate": 0.0001, "loss": 0.3256, "step": 2100 }, { "epoch": 11.733333333333333, "grad_norm": 1.249315619468689, "learning_rate": 0.0001, "loss": 0.2787, "step": 2200 }, { "epoch": 12.0, "eval_accuracy": 0.5870582959641255, "eval_loss": 2.684791088104248, "eval_runtime": 5.7871, "eval_samples_per_second": 86.399, "eval_steps_per_second": 10.886, "step": 2250 }, { "epoch": 12.266666666666667, "grad_norm": 1.163707971572876, "learning_rate": 0.0001, "loss": 0.2558, "step": 2300 }, { "epoch": 12.8, "grad_norm": 1.092466950416565, "learning_rate": 0.0001, "loss": 0.2314, "step": 2400 }, { "epoch": 12.997333333333334, "eval_accuracy": 0.5863408071748879, "eval_loss": 2.7943203449249268, "eval_runtime": 6.5027, "eval_samples_per_second": 76.891, "eval_steps_per_second": 9.688, "step": 2437 } ], "logging_steps": 100, "max_steps": 9350, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.4552508870990234e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }