{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "global_step": 11448, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "learning_rate": 2.0000000000000003e-06, "loss": 86.0421, "step": 100 }, { "epoch": 0.14, "learning_rate": 4.000000000000001e-06, "loss": 51.6706, "step": 200 }, { "epoch": 0.21, "learning_rate": 6e-06, "loss": 41.1349, "step": 300 }, { "epoch": 0.28, "learning_rate": 8.000000000000001e-06, "loss": 36.5061, "step": 400 }, { "epoch": 0.35, "learning_rate": 1e-05, "loss": 33.1858, "step": 500 }, { "epoch": 0.42, "learning_rate": 1.2e-05, "loss": 30.5206, "step": 600 }, { "epoch": 0.49, "learning_rate": 1.4000000000000001e-05, "loss": 28.0073, "step": 700 }, { "epoch": 0.56, "learning_rate": 1.6000000000000003e-05, "loss": 26.1939, "step": 800 }, { "epoch": 0.63, "learning_rate": 1.8e-05, "loss": 24.3465, "step": 900 }, { "epoch": 0.7, "learning_rate": 2e-05, "loss": 22.5759, "step": 1000 }, { "epoch": 0.77, "learning_rate": 2.2000000000000003e-05, "loss": 20.9294, "step": 1100 }, { "epoch": 0.84, "learning_rate": 2.4e-05, "loss": 19.3762, "step": 1200 }, { "epoch": 0.91, "learning_rate": 2.6000000000000002e-05, "loss": 17.72, "step": 1300 }, { "epoch": 0.98, "learning_rate": 2.8000000000000003e-05, "loss": 15.7901, "step": 1400 }, { "epoch": 1.05, "learning_rate": 3e-05, "loss": 14.0008, "step": 1500 }, { "epoch": 1.12, "learning_rate": 3.2000000000000005e-05, "loss": 12.3777, "step": 1600 }, { "epoch": 1.19, "learning_rate": 3.4000000000000007e-05, "loss": 10.7261, "step": 1700 }, { "epoch": 1.26, "learning_rate": 3.6e-05, "loss": 9.1024, "step": 1800 }, { "epoch": 1.33, "learning_rate": 3.8e-05, "loss": 7.4676, "step": 1900 }, { "epoch": 1.4, "learning_rate": 4e-05, "loss": 6.6044, "step": 2000 }, { "epoch": 1.4, "eval_bleu": 3.8045, "eval_em": 0.0, "eval_gen_len": 158.8473, "eval_loss": 6.171305179595947, "eval_runtime": 1556.9786, "eval_samples_per_second": 0.774, "eval_steps_per_second": 0.097, "step": 2000 }, { "epoch": 1.47, "learning_rate": 4.2e-05, "loss": 6.0941, "step": 2100 }, { "epoch": 1.54, "learning_rate": 4.4000000000000006e-05, "loss": 5.6741, "step": 2200 }, { "epoch": 1.61, "learning_rate": 4.600000000000001e-05, "loss": 5.4757, "step": 2300 }, { "epoch": 1.68, "learning_rate": 4.8e-05, "loss": 5.242, "step": 2400 }, { "epoch": 1.75, "learning_rate": 5e-05, "loss": 5.0108, "step": 2500 }, { "epoch": 1.82, "learning_rate": 4.944121591417077e-05, "loss": 4.8595, "step": 2600 }, { "epoch": 1.89, "learning_rate": 4.888243182834153e-05, "loss": 4.695, "step": 2700 }, { "epoch": 1.96, "learning_rate": 4.8323647742512295e-05, "loss": 4.5706, "step": 2800 }, { "epoch": 2.03, "learning_rate": 4.776486365668306e-05, "loss": 4.2498, "step": 2900 }, { "epoch": 2.1, "learning_rate": 4.720607957085382e-05, "loss": 4.1223, "step": 3000 }, { "epoch": 2.17, "learning_rate": 4.664729548502459e-05, "loss": 4.0181, "step": 3100 }, { "epoch": 2.24, "learning_rate": 4.6088511399195353e-05, "loss": 3.8722, "step": 3200 }, { "epoch": 2.31, "learning_rate": 4.552972731336611e-05, "loss": 3.7786, "step": 3300 }, { "epoch": 2.38, "learning_rate": 4.497094322753688e-05, "loss": 3.6403, "step": 3400 }, { "epoch": 2.45, "learning_rate": 4.4412159141707646e-05, "loss": 3.5437, "step": 3500 }, { "epoch": 2.52, "learning_rate": 4.385337505587841e-05, "loss": 3.389, "step": 3600 }, { "epoch": 2.59, "learning_rate": 4.329459097004918e-05, "loss": 3.2395, "step": 3700 }, { "epoch": 2.66, "learning_rate": 4.2735806884219945e-05, "loss": 3.1786, "step": 3800 }, { "epoch": 2.73, "learning_rate": 4.2177022798390704e-05, "loss": 3.0657, "step": 3900 }, { "epoch": 2.8, "learning_rate": 4.161823871256147e-05, "loss": 3.032, "step": 4000 }, { "epoch": 2.8, "eval_bleu": 27.701, "eval_em": 0.0, "eval_gen_len": 33.9568, "eval_loss": 2.904534339904785, "eval_runtime": 297.5953, "eval_samples_per_second": 4.049, "eval_steps_per_second": 0.507, "step": 4000 }, { "epoch": 2.87, "learning_rate": 4.105945462673223e-05, "loss": 2.8755, "step": 4100 }, { "epoch": 2.94, "learning_rate": 4.0500670540903e-05, "loss": 2.8396, "step": 4200 }, { "epoch": 3.0, "learning_rate": 3.994188645507376e-05, "loss": 2.7454, "step": 4300 }, { "epoch": 3.07, "learning_rate": 3.938310236924452e-05, "loss": 2.5218, "step": 4400 }, { "epoch": 3.14, "learning_rate": 3.882431828341529e-05, "loss": 2.4895, "step": 4500 }, { "epoch": 3.21, "learning_rate": 3.8265534197586055e-05, "loss": 2.4554, "step": 4600 }, { "epoch": 3.28, "learning_rate": 3.7706750111756815e-05, "loss": 2.3573, "step": 4700 }, { "epoch": 3.35, "learning_rate": 3.714796602592758e-05, "loss": 2.2979, "step": 4800 }, { "epoch": 3.42, "learning_rate": 3.658918194009835e-05, "loss": 2.1874, "step": 4900 }, { "epoch": 3.49, "learning_rate": 3.603039785426911e-05, "loss": 2.1803, "step": 5000 }, { "epoch": 3.56, "learning_rate": 3.5471613768439874e-05, "loss": 2.1553, "step": 5100 }, { "epoch": 3.63, "learning_rate": 3.491282968261064e-05, "loss": 2.0567, "step": 5200 }, { "epoch": 3.7, "learning_rate": 3.4354045596781406e-05, "loss": 2.0147, "step": 5300 }, { "epoch": 3.77, "learning_rate": 3.379526151095217e-05, "loss": 1.9817, "step": 5400 }, { "epoch": 3.84, "learning_rate": 3.323647742512294e-05, "loss": 1.8843, "step": 5500 }, { "epoch": 3.91, "learning_rate": 3.26776933392937e-05, "loss": 1.8849, "step": 5600 }, { "epoch": 3.98, "learning_rate": 3.2118909253464465e-05, "loss": 1.8232, "step": 5700 }, { "epoch": 4.05, "learning_rate": 3.156012516763523e-05, "loss": 1.6714, "step": 5800 }, { "epoch": 4.12, "learning_rate": 3.100134108180599e-05, "loss": 1.6047, "step": 5900 }, { "epoch": 4.19, "learning_rate": 3.0442556995976757e-05, "loss": 1.5893, "step": 6000 }, { "epoch": 4.19, "eval_bleu": 57.7509, "eval_em": 0.0033, "eval_gen_len": 44.132, "eval_loss": 1.7326730489730835, "eval_runtime": 426.2734, "eval_samples_per_second": 2.827, "eval_steps_per_second": 0.354, "step": 6000 }, { "epoch": 4.26, "learning_rate": 2.9883772910147524e-05, "loss": 1.5864, "step": 6100 }, { "epoch": 4.33, "learning_rate": 2.9324988824318283e-05, "loss": 1.5608, "step": 6200 }, { "epoch": 4.4, "learning_rate": 2.876620473848905e-05, "loss": 1.5144, "step": 6300 }, { "epoch": 4.47, "learning_rate": 2.8207420652659816e-05, "loss": 1.4582, "step": 6400 }, { "epoch": 4.54, "learning_rate": 2.7648636566830576e-05, "loss": 1.4793, "step": 6500 }, { "epoch": 4.61, "learning_rate": 2.7089852481001342e-05, "loss": 1.472, "step": 6600 }, { "epoch": 4.68, "learning_rate": 2.653106839517211e-05, "loss": 1.4424, "step": 6700 }, { "epoch": 4.75, "learning_rate": 2.597228430934287e-05, "loss": 1.3779, "step": 6800 }, { "epoch": 4.82, "learning_rate": 2.5413500223513638e-05, "loss": 1.3611, "step": 6900 }, { "epoch": 4.89, "learning_rate": 2.48547161376844e-05, "loss": 1.3311, "step": 7000 }, { "epoch": 4.96, "learning_rate": 2.4295932051855164e-05, "loss": 1.3164, "step": 7100 }, { "epoch": 5.03, "learning_rate": 2.373714796602593e-05, "loss": 1.2119, "step": 7200 }, { "epoch": 5.1, "learning_rate": 2.3178363880196693e-05, "loss": 1.1122, "step": 7300 }, { "epoch": 5.17, "learning_rate": 2.2619579794367456e-05, "loss": 1.1198, "step": 7400 }, { "epoch": 5.24, "learning_rate": 2.206079570853822e-05, "loss": 1.0416, "step": 7500 }, { "epoch": 5.31, "learning_rate": 2.1502011622708985e-05, "loss": 1.1042, "step": 7600 }, { "epoch": 5.38, "learning_rate": 2.0943227536879752e-05, "loss": 1.0715, "step": 7700 }, { "epoch": 5.45, "learning_rate": 2.0384443451050515e-05, "loss": 1.0815, "step": 7800 }, { "epoch": 5.52, "learning_rate": 1.982565936522128e-05, "loss": 1.0445, "step": 7900 }, { "epoch": 5.59, "learning_rate": 1.9266875279392044e-05, "loss": 1.0512, "step": 8000 }, { "epoch": 5.59, "eval_bleu": 71.1272, "eval_em": 0.0241, "eval_gen_len": 46.0672, "eval_loss": 1.2382431030273438, "eval_runtime": 358.1089, "eval_samples_per_second": 3.365, "eval_steps_per_second": 0.422, "step": 8000 }, { "epoch": 5.66, "learning_rate": 1.8708091193562807e-05, "loss": 1.0201, "step": 8100 }, { "epoch": 5.73, "learning_rate": 1.8149307107733573e-05, "loss": 1.0377, "step": 8200 }, { "epoch": 5.8, "learning_rate": 1.7590523021904336e-05, "loss": 0.986, "step": 8300 }, { "epoch": 5.87, "learning_rate": 1.70317389360751e-05, "loss": 1.0244, "step": 8400 }, { "epoch": 5.94, "learning_rate": 1.6472954850245866e-05, "loss": 0.9654, "step": 8500 }, { "epoch": 6.01, "learning_rate": 1.5914170764416632e-05, "loss": 0.9454, "step": 8600 }, { "epoch": 6.08, "learning_rate": 1.5355386678587395e-05, "loss": 0.8179, "step": 8700 }, { "epoch": 6.15, "learning_rate": 1.479660259275816e-05, "loss": 0.8433, "step": 8800 }, { "epoch": 6.22, "learning_rate": 1.4237818506928924e-05, "loss": 0.8235, "step": 8900 }, { "epoch": 6.29, "learning_rate": 1.3679034421099687e-05, "loss": 0.832, "step": 9000 }, { "epoch": 6.36, "learning_rate": 1.312025033527045e-05, "loss": 0.8019, "step": 9100 }, { "epoch": 6.43, "learning_rate": 1.2561466249441217e-05, "loss": 0.806, "step": 9200 }, { "epoch": 6.5, "learning_rate": 1.2002682163611981e-05, "loss": 0.7985, "step": 9300 }, { "epoch": 6.57, "learning_rate": 1.1443898077782746e-05, "loss": 0.801, "step": 9400 }, { "epoch": 6.64, "learning_rate": 1.0885113991953509e-05, "loss": 0.7758, "step": 9500 }, { "epoch": 6.71, "learning_rate": 1.0326329906124274e-05, "loss": 0.804, "step": 9600 }, { "epoch": 6.78, "learning_rate": 9.767545820295038e-06, "loss": 0.7846, "step": 9700 }, { "epoch": 6.85, "learning_rate": 9.208761734465803e-06, "loss": 0.7736, "step": 9800 }, { "epoch": 6.92, "learning_rate": 8.649977648636568e-06, "loss": 0.7625, "step": 9900 }, { "epoch": 6.99, "learning_rate": 8.09119356280733e-06, "loss": 0.7515, "step": 10000 }, { "epoch": 6.99, "eval_bleu": 75.7356, "eval_em": 0.0432, "eval_gen_len": 47.2896, "eval_loss": 1.057088017463684, "eval_runtime": 378.6301, "eval_samples_per_second": 3.183, "eval_steps_per_second": 0.399, "step": 10000 }, { "epoch": 7.06, "learning_rate": 7.532409476978096e-06, "loss": 0.6703, "step": 10100 }, { "epoch": 7.13, "learning_rate": 6.973625391148861e-06, "loss": 0.6731, "step": 10200 }, { "epoch": 7.2, "learning_rate": 6.414841305319625e-06, "loss": 0.6584, "step": 10300 }, { "epoch": 7.27, "learning_rate": 5.856057219490389e-06, "loss": 0.6758, "step": 10400 }, { "epoch": 7.34, "learning_rate": 5.297273133661153e-06, "loss": 0.6801, "step": 10500 }, { "epoch": 7.41, "learning_rate": 4.738489047831918e-06, "loss": 0.6556, "step": 10600 }, { "epoch": 7.48, "learning_rate": 4.1797049620026825e-06, "loss": 0.6413, "step": 10700 }, { "epoch": 7.55, "learning_rate": 3.6209208761734468e-06, "loss": 0.6548, "step": 10800 }, { "epoch": 7.62, "learning_rate": 3.062136790344211e-06, "loss": 0.6631, "step": 10900 }, { "epoch": 7.69, "learning_rate": 2.5033527045149757e-06, "loss": 0.6458, "step": 11000 }, { "epoch": 7.76, "learning_rate": 1.94456861868574e-06, "loss": 0.629, "step": 11100 }, { "epoch": 7.83, "learning_rate": 1.3857845328565042e-06, "loss": 0.643, "step": 11200 }, { "epoch": 7.9, "learning_rate": 8.270004470272687e-07, "loss": 0.6475, "step": 11300 }, { "epoch": 7.97, "learning_rate": 2.682163611980331e-07, "loss": 0.633, "step": 11400 }, { "epoch": 8.0, "step": 11448, "total_flos": 7528840539566280.0, "train_loss": 0.15130291744847968, "train_runtime": 1637.6947, "train_samples_per_second": 111.816, "train_steps_per_second": 6.99 } ], "max_steps": 11448, "num_train_epochs": 8, "total_flos": 7528840539566280.0, "trial_name": null, "trial_params": null }