{ "best_metric": 0.19640253484249115, "best_model_checkpoint": "/kaggle/working/hubert-amharic/checkpoint-1500", "epoch": 12.121212121212121, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.40404040404040403, "grad_norm": 2.795124053955078, "learning_rate": 9.730094466936572e-06, "loss": 1.5155, "step": 100 }, { "epoch": 0.8080808080808081, "grad_norm": 2.900432825088501, "learning_rate": 9.460188933873145e-06, "loss": 1.175, "step": 200 }, { "epoch": 1.2121212121212122, "grad_norm": 3.376126766204834, "learning_rate": 9.195681511470986e-06, "loss": 0.7957, "step": 300 }, { "epoch": 1.6161616161616161, "grad_norm": 6.262876987457275, "learning_rate": 8.925775978407558e-06, "loss": 0.6227, "step": 400 }, { "epoch": 2.0202020202020203, "grad_norm": 13.498932838439941, "learning_rate": 8.65587044534413e-06, "loss": 0.5372, "step": 500 }, { "epoch": 2.0202020202020203, "eval_accuracy": 0.8929293155670166, "eval_loss": 0.3763836622238159, "eval_runtime": 22.9508, "eval_samples_per_second": 21.568, "eval_steps_per_second": 5.403, "step": 500 }, { "epoch": 2.4242424242424243, "grad_norm": 7.660800457000732, "learning_rate": 8.388663967611337e-06, "loss": 0.4195, "step": 600 }, { "epoch": 2.8282828282828283, "grad_norm": 10.333969116210938, "learning_rate": 8.118758434547908e-06, "loss": 0.4207, "step": 700 }, { "epoch": 3.2323232323232323, "grad_norm": 25.161924362182617, "learning_rate": 7.848852901484481e-06, "loss": 0.3231, "step": 800 }, { "epoch": 3.6363636363636362, "grad_norm": 0.32242974638938904, "learning_rate": 7.578947368421054e-06, "loss": 0.236, "step": 900 }, { "epoch": 4.040404040404041, "grad_norm": 4.83636474609375, "learning_rate": 7.309041835357625e-06, "loss": 0.2254, "step": 1000 }, { "epoch": 4.040404040404041, "eval_accuracy": 0.9111111164093018, "eval_loss": 0.3798251748085022, "eval_runtime": 23.0284, "eval_samples_per_second": 21.495, "eval_steps_per_second": 5.385, "step": 1000 }, { "epoch": 4.444444444444445, "grad_norm": 10.075101852416992, "learning_rate": 7.039136302294197e-06, "loss": 0.2038, "step": 1100 }, { "epoch": 4.848484848484849, "grad_norm": 5.713657379150391, "learning_rate": 6.76923076923077e-06, "loss": 0.2233, "step": 1200 }, { "epoch": 5.252525252525253, "grad_norm": 64.90503692626953, "learning_rate": 6.499325236167342e-06, "loss": 0.2077, "step": 1300 }, { "epoch": 5.656565656565657, "grad_norm": 33.007598876953125, "learning_rate": 6.229419703103914e-06, "loss": 0.1811, "step": 1400 }, { "epoch": 6.0606060606060606, "grad_norm": 2.3471884727478027, "learning_rate": 5.959514170040487e-06, "loss": 0.1699, "step": 1500 }, { "epoch": 6.0606060606060606, "eval_accuracy": 0.9535353779792786, "eval_loss": 0.19640253484249115, "eval_runtime": 22.503, "eval_samples_per_second": 21.997, "eval_steps_per_second": 5.51, "step": 1500 }, { "epoch": 6.4646464646464645, "grad_norm": 46.472137451171875, "learning_rate": 5.692307692307692e-06, "loss": 0.1676, "step": 1600 }, { "epoch": 6.8686868686868685, "grad_norm": 0.10037334263324738, "learning_rate": 5.4224021592442655e-06, "loss": 0.1468, "step": 1700 }, { "epoch": 7.2727272727272725, "grad_norm": 2.2828967571258545, "learning_rate": 5.152496626180837e-06, "loss": 0.1122, "step": 1800 }, { "epoch": 7.6767676767676765, "grad_norm": 2.2225501537323, "learning_rate": 4.882591093117409e-06, "loss": 0.1162, "step": 1900 }, { "epoch": 8.080808080808081, "grad_norm": 3.6428568363189697, "learning_rate": 4.6126855600539814e-06, "loss": 0.1245, "step": 2000 }, { "epoch": 8.080808080808081, "eval_accuracy": 0.9595959782600403, "eval_loss": 0.22902674973011017, "eval_runtime": 22.8105, "eval_samples_per_second": 21.701, "eval_steps_per_second": 5.436, "step": 2000 }, { "epoch": 8.484848484848484, "grad_norm": 0.09281215816736221, "learning_rate": 4.342780026990554e-06, "loss": 0.1027, "step": 2100 }, { "epoch": 8.88888888888889, "grad_norm": 30.872848510742188, "learning_rate": 4.072874493927126e-06, "loss": 0.105, "step": 2200 }, { "epoch": 9.292929292929292, "grad_norm": 0.3525031507015228, "learning_rate": 3.8029689608636982e-06, "loss": 0.1184, "step": 2300 }, { "epoch": 9.696969696969697, "grad_norm": 0.40138208866119385, "learning_rate": 3.53306342780027e-06, "loss": 0.1048, "step": 2400 }, { "epoch": 10.1010101010101, "grad_norm": 0.025619197636842728, "learning_rate": 3.2631578947368423e-06, "loss": 0.0597, "step": 2500 }, { "epoch": 10.1010101010101, "eval_accuracy": 0.9636363387107849, "eval_loss": 0.22432678937911987, "eval_runtime": 22.7183, "eval_samples_per_second": 21.789, "eval_steps_per_second": 5.458, "step": 2500 }, { "epoch": 10.505050505050505, "grad_norm": 0.028187109157443047, "learning_rate": 2.9932523616734146e-06, "loss": 0.0525, "step": 2600 }, { "epoch": 10.909090909090908, "grad_norm": 0.020364606752991676, "learning_rate": 2.723346828609987e-06, "loss": 0.1105, "step": 2700 }, { "epoch": 11.313131313131313, "grad_norm": 18.78951644897461, "learning_rate": 2.453441295546559e-06, "loss": 0.078, "step": 2800 }, { "epoch": 11.717171717171716, "grad_norm": 48.03165817260742, "learning_rate": 2.183535762483131e-06, "loss": 0.1166, "step": 2900 }, { "epoch": 12.121212121212121, "grad_norm": 0.21182887256145477, "learning_rate": 1.913630229419703e-06, "loss": 0.0816, "step": 3000 }, { "epoch": 12.121212121212121, "eval_accuracy": 0.9555555582046509, "eval_loss": 0.27169349789619446, "eval_runtime": 22.618, "eval_samples_per_second": 21.885, "eval_steps_per_second": 5.482, "step": 3000 }, { "epoch": 12.121212121212121, "step": 3000, "total_flos": 7.685447856522912e+17, "train_loss": 0.2951181084314982, "train_runtime": 1630.5884, "train_samples_per_second": 18.205, "train_steps_per_second": 2.272 } ], "logging_steps": 100, "max_steps": 3705, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.685447856522912e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }