{ "best_metric": null, "best_model_checkpoint": null, "epoch": 18.0, "eval_steps": 500, "global_step": 3375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5333333333333333, "grad_norm": 0.2409825623035431, "learning_rate": 0.0001, "loss": 1.7886, "step": 100 }, { "epoch": 0.9973333333333333, "eval_accuracy": 0.6061076233183856, "eval_loss": 1.6901254653930664, "eval_runtime": 5.5484, "eval_samples_per_second": 90.116, "eval_steps_per_second": 11.355, "step": 187 }, { "epoch": 1.0666666666666667, "grad_norm": 0.2729445993900299, "learning_rate": 0.0001, "loss": 1.7283, "step": 200 }, { "epoch": 1.6, "grad_norm": 0.37678101658821106, "learning_rate": 0.0001, "loss": 1.6544, "step": 300 }, { "epoch": 2.0, "eval_accuracy": 0.607677130044843, "eval_loss": 1.6766204833984375, "eval_runtime": 6.4294, "eval_samples_per_second": 77.768, "eval_steps_per_second": 9.799, "step": 375 }, { "epoch": 2.1333333333333333, "grad_norm": 0.4604090750217438, "learning_rate": 0.0001, "loss": 1.6338, "step": 400 }, { "epoch": 2.6666666666666665, "grad_norm": 0.5606330633163452, "learning_rate": 0.0001, "loss": 1.5273, "step": 500 }, { "epoch": 2.997333333333333, "eval_accuracy": 0.6079910313901346, "eval_loss": 1.6929064989089966, "eval_runtime": 5.431, "eval_samples_per_second": 92.063, "eval_steps_per_second": 11.6, "step": 562 }, { "epoch": 3.2, "grad_norm": 0.6363856196403503, "learning_rate": 0.0001, "loss": 1.4796, "step": 600 }, { "epoch": 3.7333333333333334, "grad_norm": 0.5985731482505798, "learning_rate": 0.0001, "loss": 1.3871, "step": 700 }, { "epoch": 4.0, "eval_accuracy": 0.606896860986547, "eval_loss": 1.7257370948791504, "eval_runtime": 5.8318, "eval_samples_per_second": 85.737, "eval_steps_per_second": 10.803, "step": 750 }, { "epoch": 4.266666666666667, "grad_norm": 0.7769015431404114, "learning_rate": 0.0001, "loss": 1.3013, "step": 800 }, { "epoch": 4.8, "grad_norm": 0.708043098449707, "learning_rate": 0.0001, "loss": 1.23, "step": 900 }, { "epoch": 4.997333333333334, "eval_accuracy": 0.6061076233183856, "eval_loss": 1.7812739610671997, "eval_runtime": 5.9335, "eval_samples_per_second": 84.267, "eval_steps_per_second": 10.618, "step": 937 }, { "epoch": 5.333333333333333, "grad_norm": 0.8580867648124695, "learning_rate": 0.0001, "loss": 1.1011, "step": 1000 }, { "epoch": 5.866666666666667, "grad_norm": 0.9232878684997559, "learning_rate": 0.0001, "loss": 1.0749, "step": 1100 }, { "epoch": 6.0, "eval_accuracy": 0.6018026905829597, "eval_loss": 1.877648949623108, "eval_runtime": 6.474, "eval_samples_per_second": 77.231, "eval_steps_per_second": 9.731, "step": 1125 }, { "epoch": 6.4, "grad_norm": 1.0364516973495483, "learning_rate": 0.0001, "loss": 0.9164, "step": 1200 }, { "epoch": 6.933333333333334, "grad_norm": 1.0082701444625854, "learning_rate": 0.0001, "loss": 0.8957, "step": 1300 }, { "epoch": 6.997333333333334, "eval_accuracy": 0.5997937219730942, "eval_loss": 1.978247046470642, "eval_runtime": 5.9491, "eval_samples_per_second": 84.047, "eval_steps_per_second": 10.59, "step": 1312 }, { "epoch": 7.466666666666667, "grad_norm": 1.0375924110412598, "learning_rate": 0.0001, "loss": 0.7382, "step": 1400 }, { "epoch": 8.0, "grad_norm": 1.115308403968811, "learning_rate": 0.0001, "loss": 0.729, "step": 1500 }, { "epoch": 8.0, "eval_accuracy": 0.5965919282511211, "eval_loss": 2.097362756729126, "eval_runtime": 6.5905, "eval_samples_per_second": 75.867, "eval_steps_per_second": 9.559, "step": 1500 }, { "epoch": 8.533333333333333, "grad_norm": 1.3251475095748901, "learning_rate": 0.0001, "loss": 0.5643, "step": 1600 }, { "epoch": 8.997333333333334, "eval_accuracy": 0.5931300448430493, "eval_loss": 2.2553179264068604, "eval_runtime": 6.1864, "eval_samples_per_second": 80.823, "eval_steps_per_second": 10.184, "step": 1687 }, { "epoch": 9.066666666666666, "grad_norm": 1.0753710269927979, "learning_rate": 0.0001, "loss": 0.5706, "step": 1700 }, { "epoch": 9.6, "grad_norm": 1.5658849477767944, "learning_rate": 0.0001, "loss": 0.4538, "step": 1800 }, { "epoch": 10.0, "eval_accuracy": 0.5900627802690583, "eval_loss": 2.408897876739502, "eval_runtime": 6.4582, "eval_samples_per_second": 77.422, "eval_steps_per_second": 9.755, "step": 1875 }, { "epoch": 10.133333333333333, "grad_norm": 1.6254231929779053, "learning_rate": 0.0001, "loss": 0.4286, "step": 1900 }, { "epoch": 10.666666666666666, "grad_norm": 1.187232255935669, "learning_rate": 0.0001, "loss": 0.3563, "step": 2000 }, { "epoch": 10.997333333333334, "eval_accuracy": 0.588914798206278, "eval_loss": 2.529802083969116, "eval_runtime": 5.9806, "eval_samples_per_second": 83.604, "eval_steps_per_second": 10.534, "step": 2062 }, { "epoch": 11.2, "grad_norm": 1.3838512897491455, "learning_rate": 0.0001, "loss": 0.3256, "step": 2100 }, { "epoch": 11.733333333333333, "grad_norm": 1.249315619468689, "learning_rate": 0.0001, "loss": 0.2787, "step": 2200 }, { "epoch": 12.0, "eval_accuracy": 0.5870582959641255, "eval_loss": 2.684791088104248, "eval_runtime": 5.7871, "eval_samples_per_second": 86.399, "eval_steps_per_second": 10.886, "step": 2250 }, { "epoch": 12.266666666666667, "grad_norm": 1.163707971572876, "learning_rate": 0.0001, "loss": 0.2558, "step": 2300 }, { "epoch": 12.8, "grad_norm": 1.092466950416565, "learning_rate": 0.0001, "loss": 0.2314, "step": 2400 }, { "epoch": 12.997333333333334, "eval_accuracy": 0.5863408071748879, "eval_loss": 2.7943203449249268, "eval_runtime": 6.5027, "eval_samples_per_second": 76.891, "eval_steps_per_second": 9.688, "step": 2437 }, { "epoch": 13.333333333333334, "grad_norm": 1.1065410375595093, "learning_rate": 0.0001, "loss": 0.2029, "step": 2500 }, { "epoch": 13.866666666666667, "grad_norm": 1.1960605382919312, "learning_rate": 0.0001, "loss": 0.1923, "step": 2600 }, { "epoch": 14.0, "eval_accuracy": 0.5856502242152466, "eval_loss": 2.8624069690704346, "eval_runtime": 5.5405, "eval_samples_per_second": 90.244, "eval_steps_per_second": 11.371, "step": 2625 }, { "epoch": 14.4, "grad_norm": 1.014957070350647, "learning_rate": 0.0001, "loss": 0.1661, "step": 2700 }, { "epoch": 14.933333333333334, "grad_norm": 1.2782268524169922, "learning_rate": 0.0001, "loss": 0.1687, "step": 2800 }, { "epoch": 14.997333333333334, "eval_accuracy": 0.5847533632286995, "eval_loss": 2.978261947631836, "eval_runtime": 6.5241, "eval_samples_per_second": 76.639, "eval_steps_per_second": 9.656, "step": 2812 }, { "epoch": 15.466666666666667, "grad_norm": 1.0024491548538208, "learning_rate": 0.0001, "loss": 0.1464, "step": 2900 }, { "epoch": 16.0, "grad_norm": 0.9853018522262573, "learning_rate": 0.0001, "loss": 0.1514, "step": 3000 }, { "epoch": 16.0, "eval_accuracy": 0.5849865470852018, "eval_loss": 3.0238428115844727, "eval_runtime": 5.8457, "eval_samples_per_second": 85.533, "eval_steps_per_second": 10.777, "step": 3000 }, { "epoch": 16.533333333333335, "grad_norm": 1.2373902797698975, "learning_rate": 0.0001, "loss": 0.1282, "step": 3100 }, { "epoch": 16.997333333333334, "eval_accuracy": 0.5841614349775784, "eval_loss": 3.091371774673462, "eval_runtime": 5.5018, "eval_samples_per_second": 90.879, "eval_steps_per_second": 11.451, "step": 3187 }, { "epoch": 17.066666666666666, "grad_norm": 0.9496106505393982, "learning_rate": 0.0001, "loss": 0.1358, "step": 3200 }, { "epoch": 17.6, "grad_norm": 1.0109747648239136, "learning_rate": 0.0001, "loss": 0.121, "step": 3300 }, { "epoch": 18.0, "eval_accuracy": 0.5847982062780269, "eval_loss": 3.143228530883789, "eval_runtime": 5.4395, "eval_samples_per_second": 91.919, "eval_steps_per_second": 11.582, "step": 3375 } ], "logging_steps": 100, "max_steps": 9350, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.399578187296932e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }