{ "best_metric": 0.09009132534265518, "best_model_checkpoint": "/kaggle/working/xls-r-amharic/checkpoint-2500", "epoch": 14.969696969696969, "eval_steps": 500, "global_step": 3705, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.40404040404040403, "grad_norm": 3.749224901199341, "learning_rate": 9.730094466936572e-06, "loss": 1.1928, "step": 100 }, { "epoch": 0.8080808080808081, "grad_norm": 5.272401809692383, "learning_rate": 9.46288798920378e-06, "loss": 0.7006, "step": 200 }, { "epoch": 1.2121212121212122, "grad_norm": 2.211305618286133, "learning_rate": 9.192982456140351e-06, "loss": 0.3993, "step": 300 }, { "epoch": 1.6161616161616161, "grad_norm": 0.4073326289653778, "learning_rate": 8.923076923076925e-06, "loss": 0.2935, "step": 400 }, { "epoch": 2.0202020202020203, "grad_norm": 10.367269515991211, "learning_rate": 8.65587044534413e-06, "loss": 0.2847, "step": 500 }, { "epoch": 2.0202020202020203, "eval_accuracy": 0.9212121367454529, "eval_loss": 0.247890442609787, "eval_runtime": 43.8986, "eval_samples_per_second": 11.276, "eval_steps_per_second": 2.825, "step": 500 }, { "epoch": 2.4242424242424243, "grad_norm": 14.821730613708496, "learning_rate": 8.385964912280704e-06, "loss": 0.1788, "step": 600 }, { "epoch": 2.8282828282828283, "grad_norm": 1.658502221107483, "learning_rate": 8.116059379217275e-06, "loss": 0.1541, "step": 700 }, { "epoch": 3.2323232323232323, "grad_norm": 7.924429416656494, "learning_rate": 7.846153846153847e-06, "loss": 0.1683, "step": 800 }, { "epoch": 3.6363636363636362, "grad_norm": 0.14956633746623993, "learning_rate": 7.576248313090419e-06, "loss": 0.1315, "step": 900 }, { "epoch": 4.040404040404041, "grad_norm": 0.08813250064849854, "learning_rate": 7.306342780026991e-06, "loss": 0.1138, "step": 1000 }, { "epoch": 4.040404040404041, "eval_accuracy": 0.9434343576431274, "eval_loss": 0.20633606612682343, "eval_runtime": 43.6554, "eval_samples_per_second": 11.339, "eval_steps_per_second": 2.84, "step": 1000 }, { "epoch": 4.444444444444445, "grad_norm": 2.7388453483581543, "learning_rate": 7.036437246963563e-06, "loss": 0.1113, "step": 1100 }, { "epoch": 4.848484848484849, "grad_norm": 1.112307071685791, "learning_rate": 6.766531713900135e-06, "loss": 0.1174, "step": 1200 }, { "epoch": 5.252525252525253, "grad_norm": 0.9322103261947632, "learning_rate": 6.496626180836708e-06, "loss": 0.1025, "step": 1300 }, { "epoch": 5.656565656565657, "grad_norm": 0.03639671951532364, "learning_rate": 6.22672064777328e-06, "loss": 0.0754, "step": 1400 }, { "epoch": 6.0606060606060606, "grad_norm": 0.024523159489035606, "learning_rate": 5.956815114709852e-06, "loss": 0.0614, "step": 1500 }, { "epoch": 6.0606060606060606, "eval_accuracy": 0.965656578540802, "eval_loss": 0.1415119469165802, "eval_runtime": 43.7636, "eval_samples_per_second": 11.311, "eval_steps_per_second": 2.833, "step": 1500 }, { "epoch": 6.4646464646464645, "grad_norm": 0.04833903908729553, "learning_rate": 5.686909581646424e-06, "loss": 0.0296, "step": 1600 }, { "epoch": 6.8686868686868685, "grad_norm": 0.018174033612012863, "learning_rate": 5.417004048582997e-06, "loss": 0.086, "step": 1700 }, { "epoch": 7.2727272727272725, "grad_norm": 0.2865201532840729, "learning_rate": 5.147098515519568e-06, "loss": 0.0671, "step": 1800 }, { "epoch": 7.6767676767676765, "grad_norm": 0.0348142571747303, "learning_rate": 4.877192982456141e-06, "loss": 0.0512, "step": 1900 }, { "epoch": 8.080808080808081, "grad_norm": 0.2693362832069397, "learning_rate": 4.607287449392713e-06, "loss": 0.0349, "step": 2000 }, { "epoch": 8.080808080808081, "eval_accuracy": 0.973737359046936, "eval_loss": 0.13826610147953033, "eval_runtime": 43.9764, "eval_samples_per_second": 11.256, "eval_steps_per_second": 2.82, "step": 2000 }, { "epoch": 8.484848484848484, "grad_norm": 0.033031389117240906, "learning_rate": 4.337381916329285e-06, "loss": 0.0251, "step": 2100 }, { "epoch": 8.88888888888889, "grad_norm": 0.13385087251663208, "learning_rate": 4.067476383265857e-06, "loss": 0.0367, "step": 2200 }, { "epoch": 9.292929292929292, "grad_norm": 41.249717712402344, "learning_rate": 3.7975708502024296e-06, "loss": 0.0501, "step": 2300 }, { "epoch": 9.696969696969697, "grad_norm": 0.02456289902329445, "learning_rate": 3.527665317139002e-06, "loss": 0.0555, "step": 2400 }, { "epoch": 10.1010101010101, "grad_norm": 0.010708549991250038, "learning_rate": 3.2577597840755737e-06, "loss": 0.0143, "step": 2500 }, { "epoch": 10.1010101010101, "eval_accuracy": 0.9818181991577148, "eval_loss": 0.09009132534265518, "eval_runtime": 44.1262, "eval_samples_per_second": 11.218, "eval_steps_per_second": 2.81, "step": 2500 }, { "epoch": 10.505050505050505, "grad_norm": 0.012150867842137814, "learning_rate": 2.9905533063427807e-06, "loss": 0.0486, "step": 2600 }, { "epoch": 10.909090909090908, "grad_norm": 0.008297057822346687, "learning_rate": 2.7206477732793525e-06, "loss": 0.0349, "step": 2700 }, { "epoch": 11.313131313131313, "grad_norm": 0.013805734924972057, "learning_rate": 2.4507422402159244e-06, "loss": 0.0214, "step": 2800 }, { "epoch": 11.717171717171716, "grad_norm": 0.011365755461156368, "learning_rate": 2.180836707152497e-06, "loss": 0.0229, "step": 2900 }, { "epoch": 12.121212121212121, "grad_norm": 0.02992076426744461, "learning_rate": 1.910931174089069e-06, "loss": 0.0178, "step": 3000 }, { "epoch": 12.121212121212121, "eval_accuracy": 0.9777777791023254, "eval_loss": 0.1187622994184494, "eval_runtime": 43.9086, "eval_samples_per_second": 11.273, "eval_steps_per_second": 2.824, "step": 3000 }, { "epoch": 12.525252525252526, "grad_norm": 0.09338176250457764, "learning_rate": 1.6410256410256412e-06, "loss": 0.0252, "step": 3100 }, { "epoch": 12.929292929292929, "grad_norm": 0.27980300784111023, "learning_rate": 1.3738191632928477e-06, "loss": 0.0387, "step": 3200 }, { "epoch": 13.333333333333334, "grad_norm": 0.007048506755381823, "learning_rate": 1.1039136302294197e-06, "loss": 0.0093, "step": 3300 }, { "epoch": 13.737373737373737, "grad_norm": 0.00685643358156085, "learning_rate": 8.34008097165992e-07, "loss": 0.0222, "step": 3400 }, { "epoch": 14.141414141414142, "grad_norm": 0.14381413161754608, "learning_rate": 5.641025641025642e-07, "loss": 0.0222, "step": 3500 }, { "epoch": 14.141414141414142, "eval_accuracy": 0.9777777791023254, "eval_loss": 0.12370182573795319, "eval_runtime": 44.0309, "eval_samples_per_second": 11.242, "eval_steps_per_second": 2.816, "step": 3500 }, { "epoch": 14.545454545454545, "grad_norm": 0.015089770779013634, "learning_rate": 2.941970310391363e-07, "loss": 0.0304, "step": 3600 }, { "epoch": 14.94949494949495, "grad_norm": 1.67782723903656, "learning_rate": 2.4291497975708507e-08, "loss": 0.0173, "step": 3700 }, { "epoch": 14.969696969696969, "step": 3705, "total_flos": 3.163398064220592e+18, "train_loss": 0.13106194541521884, "train_runtime": 5199.6798, "train_samples_per_second": 5.709, "train_steps_per_second": 0.713 } ], "logging_steps": 100, "max_steps": 3705, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.163398064220592e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }